From b36b68fff3bc4bfb171ffa1e23597309d5d52cd6 Mon Sep 17 00:00:00 2001
From: qiaojbao <Qiaojin.Bao@amd.com>
Date: Wed, 30 Oct 2024 16:21:51 +0800
Subject: [PATCH] Update llpc from commit 4bd41bcf

Set SLC=0 for ATM
LowerGpuRt: fix a type confusion
Honor NonUniform decorations on OpAccessChain operands
Force WGP when NGG is in passthrough mode
Use idxen modifier for (RW)StructuredBuffers
Compatible with LLVM upstream change that moved Utility funcs to Utils
Include Line Correlation Info for Tools via LLPC
[Continuations] Cleanup Complete op lowering
[CompilerUtils] Improve handling of freeze in ValueOriginTracker
[compilerutils] Add GetGlobalInModule to CrossModuleInliner
amdllpc: some usability improvements
[Continuations] Add SpecializeDriverShadersPass
Split up gl_in array type.
[Continuations] Remove stack lowering define
Use inbounds gep to index row major matrix
llvmraytracing: Remove support for _AmdContinuationsGetFlags
Properly encapsulate state that affects GPURT specialization
llvmraytracing: only support a waitmask of -1
llvmraytracing: remove remaining traces of EnqueueCall
compilerutils: fix a warning
[LowerGlobals] Mark globals with `buffer.index` users as readonly
Implement structural GEP dialect for in/out
llvmraytracing: add LLPC/LLVM scratch/global address spaces
llpc/ProcessGpuRtLibrary: use earlyGpurtTransform
[Continuations] Make use of `llvm::zip` in `CleanupContinuationsPass::updateCpsFunctionArgs`
Adjust the llvm-tblgen memory limit
lgc: split lowering of GroupMemcpyOp for mesh/task shaders to MeshTaskShader
PatchResourceCollect: stop cleaning undef output value in some cases
[Continuations] Freeze poison that is stored to payload
Fix wrong pipeline dump message
Fix issues with coherent
Update SPIR-V header to latest version
Simplify load of attribute ring buffer descriptor
[Continuations] Enable Traversal specialization test
Adjust tests to take into account upstream overload of rsrc and samp arguments
[Continuations] Fix signed/unsigned comparison warning
LowerRaytracingPipeline: tease apart return handling and any-hit exit handling
LowerRaytracingPipeline: unify function end handling
lgc: support int4 in cooperative matrix
Add rounding mode for PackHalf2x16
[Continuations] Fix unused variable warning
lgc: Add MsgPackScanner
[Continuations] Remove `LegacyCleanupContinuations` pass
LowerCooperativeMatrix: fix compile warning due to missing break statement
lgc: New RegStackUsage to propagate reg/stack usage
[LGC] Add helper lane state for subgroups
Add client name to SPIR-V frontend
Use more flexible readfirstlane
Gate argument definition for `GEP::collectOffset`.
[Continuations] Introduce dummy `csp` argument for `lgc.cps.jump`
[Continuations] Tolerate non-waiting AwaitTraversal
Add AmdExtD3DShaderIntrinsics_LoadDwordAtAddrx3
Add more BuiltIns that have a primitive index in mesh shader
Rewrite the helper getShaderModuleUsageInfo
[Continuations] Replace isLgcRtOp helper with isDialectOp helper from dialects
Downgrade scope Device to Workgroup if permitted
Optimize performance for PrimSetup
Update submodule llvm-dialects
Remove inactive built-ins for last vertex processing stage
Fix internal hash for color export shader
Fix call to lookupIntrinsicID
[Continuations] Move Simplifying GEP helpers to CompilerUtils
[PatchBufferOp] Generate struct buffer cmpxchg intrinsics
[LGC] Refactor default wave size setting
[Continuations] Derive `DispatchSystemData` type from `_cont_DispatchRaysIndex3`
[Continuations] Replace "lgc.rt" starts_with check
Rename some classes and files
Optimize PointSize write when the value is 1.0
[Continuations] Handle `_AmdGetShaderRecordIndex` calls
[LowerBufferOperations] Check for uniform buffer pointers for s_buffer_load
lgc: Mark applicable LgcDialect and Builder ops as NoDivergenceSource
---
 CMakeLists.txt                                |   11 +-
 cmake/LlvmMainRevision.cmake                  |   44 +
 .../include/compilerutils/CompilerUtils.h     |   40 +-
 .../include/compilerutils/DxilUtils.h         |   64 +
 .../compilerutils/ValueOriginTracking.h       |   50 +-
 compilerutils/lib/CompilerUtils.cpp           |  254 +-
 compilerutils/lib/ValueOriginTracking.cpp     |   35 +-
 .../lib/ValueOriginTrackingTestPass.cpp       |   56 +-
 .../lib/ValueSpecializationTestPass.cpp       |  100 +-
 .../inc/link-struct-ptr-argument.ll           |   12 +
 .../link-struct-ptr-argument.ll               |   32 +
 .../test/value-origin-tracking/basic-tests.ll |    3 +-
 .../test/value-origin-tracking/freeze-mode.ll |   70 +
 .../cross-module-inline.cpp                   |    2 +-
 docs/ComputeShaderDerivateGroups.md           |    4 +-
 gfxruntime/CMakeLists.txt                     |    1 +
 imported/llvm-dialects                        |    2 +-
 include/khronos/spirv/GLSL.std.450.h          |    2 +-
 include/khronos/spirv/NonSemanticDebugBreak.h |    2 +-
 .../khronos/spirv/NonSemanticDebugPrintf.h    |   12 +-
 .../spirv/NonSemanticShaderDebugInfo100.h     |    2 +-
 include/khronos/spirv/spirv.hpp               | 1883 +++++++++++++-
 include/vkgcDefs.h                            |    2 +
 lgc/CMakeLists.txt                            |   33 +-
 lgc/builder/BuilderRecorder.cpp               |   40 +-
 lgc/builder/ImageBuilder.cpp                  |   17 +-
 lgc/builder/InOutBuilder.cpp                  |    6 +-
 lgc/builder/MatrixBuilder.cpp                 |   35 +-
 lgc/builder/SubgroupBuilder.cpp               |  188 +-
 lgc/elfLinker/ColorExportShader.cpp           |    2 +-
 lgc/elfLinker/ColorExportShader.h             |    2 +-
 lgc/elfLinker/NullFragmentShader.cpp          |    2 +-
 lgc/include/lgc/builder/BuilderImpl.h         |   54 +-
 lgc/include/lgc/builder/SubgroupBuilder.h     |    7 +-
 lgc/include/lgc/patch/AddLoopMetadata.h       |    6 +-
 ...{PatchWorkarounds.h => ApplyWorkarounds.h} |    2 +-
 .../lgc/patch/CollectImageOperations.h        |    4 +-
 ...sourceCollect.h => CollectResourceUsage.h} |    2 +-
 ...ragColorExport.h => FragmentColorExport.h} |    2 +-
 lgc/include/lgc/patch/IncludeLlvmIr.h         |    4 +-
 ...atchBufferOp.h => LowerBufferOperations.h} |    5 +-
 lgc/include/lgc/patch/LowerInOut.h            |    6 +-
 lgc/include/lgc/patch/LowerMulDx9Zero.h       |    6 +-
 ...chReadFirstLane.h => LowerReadFirstLane.h} |    2 +-
 lgc/include/lgc/patch/MutateEntryPoint.h      |    9 +-
 ...hroughShader.h => PassthroughHullShader.h} |    2 +-
 ...chPeepholeOpt.h => PeepholeOptimization.h} |    2 +-
 ...parePipelineAbi.h => PreparePipelineAbi.h} |    2 +-
 lgc/include/lgc/patch/ScalarizeLoads.h        |    7 +-
 ...TargetFeatures.h => SetupTargetFeatures.h} |    2 +-
 lgc/include/lgc/patch/StructurizeBuffers.h    |   47 +
 lgc/include/lgc/state/AbiMetadata.h           |    5 +-
 lgc/include/lgc/state/PipelineState.h         |   10 +-
 lgc/include/lgc/state/TargetInfo.h            |    1 +
 lgc/include/lgc/util/Internal.h               |    9 +-
 lgc/include/lgc/util/MsgPackScanner.h         |  222 ++
 lgc/interface/lgc/BuilderCommon.h             |    3 +
 lgc/interface/lgc/LgcDialect.h                |    1 +
 lgc/interface/lgc/LgcDialect.td               |   86 +-
 lgc/interface/lgc/Pipeline.h                  |   14 +-
 lgc/interface/lgc/RegStackUsage.h             |   82 +
 lgc/patch/AddLoopMetadata.cpp                 |   16 +-
 ...chWorkarounds.cpp => ApplyWorkarounds.cpp} |    4 +-
 lgc/patch/CollectImageOperations.cpp          |    8 +-
 ...ceCollect.cpp => CollectResourceUsage.cpp} |  172 +-
 lgc/patch/ConfigBuilderBase.cpp               |   11 +
 lgc/patch/ConfigBuilderBase.h                 |    1 +
 lgc/patch/Continufy.cpp                       |    5 +-
 ...olorExport.cpp => FragmentColorExport.cpp} |    4 +-
 lgc/patch/GenerateCopyShader.cpp              |   20 +-
 ...der.cpp => GenerateNullFragmentShader.cpp} |    6 +-
 ...gShader.h => GenerateNullFragmentShader.h} |    2 +-
 lgc/patch/IncludeLlvmIr.cpp                   |    6 +-
 ...BufferOp.cpp => LowerBufferOperations.cpp} |  154 +-
 lgc/patch/LowerCooperativeMatrix.cpp          |  141 +-
 lgc/patch/LowerDebugPrintf.cpp                |    2 +-
 lgc/patch/LowerGpuRt.cpp                      |   19 +-
 lgc/patch/LowerInOut.cpp                      |  233 +-
 lgc/patch/LowerMulDx9Zero.cpp                 |   16 +-
 ...adFirstLane.cpp => LowerReadFirstLane.cpp} |   31 +-
 lgc/patch/MeshTaskShader.cpp                  |   46 +-
 lgc/patch/MeshTaskShader.h                    |    3 +-
 lgc/patch/MutateEntryPoint.cpp                |  181 +-
 lgc/patch/NggPrimShader.cpp                   |  215 +-
 lgc/patch/NggPrimShader.h                     |    7 +-
 lgc/patch/PassRegistry.inc                    |   13 +-
 ...ghShader.cpp => PassthroughHullShader.cpp} |    4 +-
 lgc/patch/Patch.cpp                           |   35 +-
 ...epholeOpt.cpp => PeepholeOptimization.cpp} |    4 +-
 ...PipelineAbi.cpp => PreparePipelineAbi.cpp} |    4 +-
 lgc/patch/RegisterMetadataBuilder.cpp         |   17 +
 lgc/patch/ScalarizeLoads.cpp                  |   10 +-
 ...etFeatures.cpp => SetupTargetFeatures.cpp} |    4 +-
 lgc/patch/ShaderInputs.cpp                    |    2 +-
 lgc/patch/ShaderMerger.cpp                    |   21 +-
 lgc/patch/StructurizeBuffers.cpp              |  194 ++
 lgc/state/PalMetadata.cpp                     |    3 +
 lgc/state/PassManagerCache.cpp                |    4 +-
 lgc/state/PipelineState.cpp                   |   82 +-
 lgc/state/TargetInfo.cpp                      |   64 +-
 lgc/test/CleanUndefOutputValues.lgc           |  142 ++
 lgc/test/ImageSampleNoReturn.lgc              |    2 +-
 lgc/test/PatchInvalidImageDescriptor.lgc      |   14 +-
 lgc/test/TestWaterfallLoopForStruct.lgc       |    2 +-
 lgc/test/Transforms/Continufy/simple.lgc      |    8 +-
 .../CpsLowering/continuation-basic.lgc        |    6 +-
 .../CpsLowering/cps-entry-point.lgc           |    4 +-
 .../CpsLowering/cps-from-continufy.lgc        |   11 +-
 .../CpsLowering/cps-stack-lowering.lgc        |   22 +-
 .../CpsLowering/cps-unify-exits.lgc           |   14 +-
 .../LowerCooperativeMatrix/convert.lgc        |  211 +-
 .../LowerCooperativeMatrix/load-wave64.lgc    |   46 +
 .../packed-accumulators-wave64.lgc            |    8 +-
 .../PatchBufferOp/buffer-index-op.lgc         |   65 +
 .../PatchBufferOp/buffer.atomic.ops.lgc       |  471 ++++
 lgc/test/Transforms/ReadFirstLane/simple.lgc  |   39 +-
 .../scalarizationOfDescriptorLoadsTest1.lgc   |    4 +-
 .../scalarizationOfDescriptorLoadsTest10.lgc  |    2 +-
 .../scalarizationOfDescriptorLoadsTest11.lgc  |    2 +-
 .../scalarizationOfDescriptorLoadsTest12.lgc  |    2 +-
 .../scalarizationOfDescriptorLoadsTest13.lgc  |    2 +-
 .../scalarizationOfDescriptorLoadsTest14.lgc  |    4 +-
 .../scalarizationOfDescriptorLoadsTest15.lgc  |    4 +-
 .../scalarizationOfDescriptorLoadsTest16.lgc  |    2 +-
 .../scalarizationOfDescriptorLoadsTest2.lgc   |    2 +-
 .../scalarizationOfDescriptorLoadsTest3.lgc   |    2 +-
 .../scalarizationOfDescriptorLoadsTest4.lgc   |    2 +-
 .../scalarizationOfDescriptorLoadsTest5.lgc   |    4 +-
 .../scalarizationOfDescriptorLoadsTest6.lgc   |    8 +-
 .../scalarizationOfDescriptorLoadsTest7.lgc   |    2 +-
 .../scalarizationOfDescriptorLoadsTest8.lgc   |    2 +-
 .../scalarizationOfDescriptorLoadsTest9.lgc   |    2 +-
 lgc/unittests/CMakeLists.txt                  |    2 +
 lgc/unittests/internal/CMakeLists.txt         |   37 +
 lgc/unittests/internal/MsgPackScannerTest.cpp |  272 +++
 lgc/util/Internal.cpp                         |   35 +-
 lgc/util/MsgPackScanner.cpp                   |  675 ++++++
 lgc/util/RegStackUsage.cpp                    |  563 +++++
 llpc/CMakeLists.txt                           |  140 +-
 llpc/context/llpcCompiler.cpp                 |  181 +-
 llpc/context/llpcCompiler.h                   |   13 +-
 llpc/context/llpcComputeContext.cpp           |    5 +-
 llpc/context/llpcComputeContext.h             |    4 +-
 llpc/context/llpcContext.cpp                  |   63 +-
 llpc/context/llpcContext.h                    |   12 +-
 llpc/context/llpcDialect.cpp                  |   36 +
 llpc/context/llpcDialect.h                    |    4 +
 llpc/context/llpcGraphicsContext.cpp          |    5 +-
 llpc/context/llpcGraphicsContext.h            |    4 +-
 llpc/context/llpcPipelineContext.cpp          |   11 +-
 llpc/context/llpcPipelineContext.h            |    3 +-
 llpc/context/llpcRayTracingContext.cpp        |   12 +-
 llpc/context/llpcRayTracingContext.h          |    2 +-
 llpc/docs/DdnPackShaderInputOutput.md         |    6 +-
 llpc/include/LlpcDialect.td                   |   53 +
 llpc/include/llpc.h                           |    2 +
 llpc/lowering/LowerAccessChain.cpp            |  256 +-
 llpc/lowering/LowerAccessChain.h              |    8 +-
 llpc/lowering/LowerGlCompatibility.cpp        |   33 +-
 llpc/lowering/LowerGlobals.cpp                |   48 +-
 llpc/lowering/Lowering.cpp                    |    4 +-
 llpc/lowering/ProcessGpuRtLibrary.cpp         |  126 +-
 llpc/lowering/ProcessGpuRtLibrary.h           |   47 +-
 llpc/lowering/ScalarReplacementOfBuiltins.cpp |   80 +-
 .../core/FMA_TestOperandIsZero.spvasm         |    2 +-
 ...onUniform_TestTexutreLoadStoreInt64.spvasm |   55 +-
 ...OpAtomicAnd_TestInt64ImageAtomicAnd.spvasm |    2 +-
 ...change_TestInt64ImageAtomicCompSwap.spvasm |    2 +-
 ...change_TestInt64ImageAtomicExchange.spvasm |    2 +-
 ...pAtomicIAdd_TestInt64ImageAtomicAdd.spvasm |    2 +-
 ...ement_TestInt64ImageAtomicDecrement.spvasm |    2 +-
 ...ement_TestInt64ImageAtomicIncrement.spvasm |    2 +-
 ...pAtomicISub_TestInt64ImageAtomicSub.spvasm |    2 +-
 ...AtomicLoad_TestInt64ImageAtomicLoad.spvasm |    2 +-
 .../OpAtomicOr_TestInt64ImageAtomicOr.spvasm  |    2 +-
 ...pAtomicSMax_TestInt64ImageAtomicMax.spvasm |    2 +-
 ...pAtomicSMin_TestInt64ImageAtomicMin.spvasm |    2 +-
 ...omicStore_TestInt64ImageAtomicStore.spvasm |    2 +-
 ...pAtomicUMax_TestInt64ImageAtomicMax.spvasm |    2 +-
 ...pAtomicUMin_TestInt64ImageAtomicMin.spvasm |    2 +-
 .../OpAtomicXXX_TestImageDimension_lit.comp   |  160 +-
 ...tomicXXX_TestImageMemoryQualifier_lit.comp |    6 +-
 .../core/OpAtomicXXX_TestImage_lit.comp       |   34 +-
 .../core/OpAtomicXXX_TestImage_lit.frag       |   30 +-
 ...OpAtomicXor_TestInt64ImageAtomicXor.spvasm |    2 +-
 ...onGroup_TestGroupAndGroupMember_lit.spvasm |   14 +-
 .../shaderdb/core/OpExtInst_PackHalf2x16.comp |   25 +
 .../core/OpFMul_TestOperandIsZero.spvasm      |    2 +-
 .../core/OpImageDrefGather_TestBasic_lit.frag |    2 +-
 .../OpImageDrefGather_TestOffset_lit.frag     |    2 +-
 ...refGather_TestTextureGatherOffset_lit.frag |    6 +-
 ...efGather_TestTextureGatherOffsets_lit.frag |   24 +-
 ...ImageDrefGather_TestTextureGather_lit.frag |    6 +-
 ...mageExplicitLod_TestDrefLodOffset_lit.frag |    2 +-
 ..._Test2DMSArray_disableShadowTable_lit.frag |    2 +-
 .../core/OpImageFetch_Test2DMSArray_lit.frag  |    4 +-
 .../core/OpImageFetch_Test2DMS_lit.frag       |    2 +-
 .../core/OpImageFetch_TestBasic_lit.frag      |    2 +-
 .../OpImageFetch_TestIntegerSampler_lit.frag  |    4 +-
 .../core/OpImageFetch_TestOffset_lit.frag     |    2 +-
 ...OpImageFetch_TestTexelFetchOffset_lit.frag |   12 +-
 .../core/OpImageFetch_TestTexelFetch_lit.frag |   10 +-
 .../core/OpImageGather_TestBasic_lit.frag     |    2 +-
 .../OpImageGather_TestConstOffsets_lit.frag   |    8 +-
 ...pImageGather_TestDrefConstOffsets_lit.frag |    8 +-
 .../OpImageGather_TestIntegerSampler.frag     |   24 +-
 .../core/OpImageGather_TestOffset_lit.frag    |    2 +-
 ...geGather_TestTextureGatherBiasLod_lit.frag |   40 +-
 ...ageGather_TestTextureGatherOffset_lit.frag |    6 +-
 ...geGather_TestTextureGatherOffsets_lit.frag |   24 +-
 .../OpImageGather_TestTextureGather_lit.frag  |    6 +-
 ...geReadWrite_TestImageLoadStoreLod_lit.comp |   26 +-
 .../core/OpImageRead_Test2DMS_lit.comp        |    4 +-
 .../core/OpImageRead_TestBasic_lit.comp       |   12 +-
 .../core/OpImageRead_TestCube_lit.comp        |    2 +-
 .../core/OpImageRead_TestImageLoad_lit.frag   |    8 +-
 .../OpImageRead_TestInt64ImageLoad.spvasm     |    2 +-
 .../core/OpImageRead_TestIntImage_lit.comp    |    2 +-
 .../OpImageRead_TestMemoryQualifier_lit.comp  |    8 +-
 .../OpImageRead_TestNonVec4Data_lit.spvasm    |   18 +-
 .../OpImageRead_TestSubpassInput_lit.frag     |    6 +-
 .../OpImageSampleExplicitLod_TestLod_lit.frag |    2 +-
 ...eExplicitLod_TestTextureGradClamp_lit.frag |   10 +-
 ...ExplicitLod_TestTextureGradOffset_lit.frag |    6 +-
 ...SampleExplicitLod_TestTextureGrad_lit.frag |    6 +-
 ...eExplicitLod_TestTextureLodOffset_lit.frag |    6 +-
 ...eSampleExplicitLod_TestTextureLod_lit.frag |    6 +-
 ...mageSampleImplicitLod_Test1DArray_lit.frag |    2 +-
 ...mageSampleImplicitLod_Test2DArray_lit.frag |    2 +-
 ...ImageSampleImplicitLod_Test2DRect_lit.frag |    2 +-
 .../OpImageSampleImplicitLod_Test3D_lit.frag  |    2 +-
 ...ImplicitLod_TestArrayDirectAccess_lit.frag |    2 +-
 ...pImageSampleImplicitLod_TestBasic_lit.frag |    2 +-
 ...OpImageSampleImplicitLod_TestBias_lit.frag |    2 +-
 ...ageSampleImplicitLod_TestDrefGrad_lit.frag |    2 +-
 ...OpImageSampleImplicitLod_TestGrad_lit.frag |    2 +-
 ...pleImplicitLod_TestIntegerSampler_lit.frag |    4 +-
 ...Lod_TestMultiDimArrayDirectAccess_lit.frag |    2 +-
 ...ImageSampleImplicitLod_TestOffset_lit.frag |    2 +-
 ...mplicitLod_TestProjDrefGradOffset_lit.frag |    2 +-
 ...ageSampleImplicitLod_TestSeparate_lit.frag |    2 +-
 ...eImplicitLod_TestTextureBiasClamp_lit.frag |   16 +-
 ...ampleImplicitLod_TestTextureClamp_lit.frag |   10 +-
 ...eImplicitLod_TestTextureGradClamp_lit.frag |   12 +-
 ...citLod_TestTextureGradOffsetClamp_lit.frag |   12 +-
 ...mplicitLod_TestTextureOffsetClamp_lit.frag |   12 +-
 ...mpleImplicitLod_TestTextureOffset_lit.frag |    6 +-
 ...mageSampleImplicitLod_TestTexture_lit.frag |    6 +-
 ...SparseRead_TestInt64SparseImageLoad.spvasm |    2 +-
 .../OpImageWrite_TestInt64ImageStore.spvasm   |    2 +-
 ...ryBarrier_TestMemoryBarrierShared_lit.comp |    2 +-
 llpc/test/shaderdb/core/OpPtrEqualTest.spvasm |   11 +-
 ...peSampledImage_TestWaterfallInsertion.frag |    2 +-
 ...peSampledImage_TestWaterfallScalarize.frag |    6 +-
 ...age_TestWaterfallScalarize_MultiBlock.frag |   18 +-
 ...age_TestWaterfallScalarize_SharedDesc.frag |   12 +-
 .../TestEnableImplicitInvariantExports.vert   |    2 +-
 .../shaderdb/core/TestXfbStateMetadata.vert   |    2 +-
 .../debug_info/NonSemanticShaderDebug.pipe    |   86 +-
 ...PipelineGsTess_TestVsTesGsMergeShader.pipe | 1120 +++++++++
 .../PipelineGs_TestVsGSMergeShader.pipe       |  435 ++++
 .../ExtFragMask_TestFragFetch_lit.frag        |   12 +-
 llpc/test/shaderdb/general/CoherentArray.frag |   30 +
 .../test/shaderdb/general/CoherentVector.frag |   22 +
 llpc/test/shaderdb/general/ImgDescLoad.comp   |    2 +-
 .../general/OptimizePointSizeWrite.pipe       |   84 +
 ...ipelineCs_TestFetch2DMSFmaskBased_lit.pipe |    4 +-
 .../PipelineVsFs_ColorExportShader.pipe       |   40 +
 .../general/PipelineVsFs_GlPositionFMF.pipe   |    1 +
 .../shaderdb/general/TestWorkgroupIdOpt.comp  |    2 +-
 .../shaderdb/gfx11/AttributePrecedesPos.pipe  |    4 +-
 .../shaderdb/gfx11/SgprUserDataInit_Fs.pipe   |    4 +-
 .../cooperativeMatrix/array-of-matrices.comp  |   47 +-
 .../cooperativeMatrix/extract-insert.spvasm   |   43 +-
 .../cooperativeMatrix/loadstore-uvec4.comp    |    6 +-
 .../PipelineVsFs_TestGraphicsLibrary.pipe     |    4 +-
 .../hlsl/Hlsl_TestStructuredBuffers.spvasm    |   93 +
 .../object/ObjFragMask_TestFragFetch_lit.frag |    6 +-
 .../object/ObjInput_TestGsBuiltIn_lit.geom    |   24 +-
 .../object/ObjNonUniform_TestImageSample.frag |    5 +-
 ...Sampler_TestSeparateSamplerShadow_lit.frag |    2 +-
 .../ObjStorageBlock_TestRowMajor_lit.frag     |   48 +-
 .../ObjStorageBlock_TestRuntimeArray_lit.vert |    2 +-
 .../ray_tracing/PipelineRayquery.pipe         |   10 +-
 ...Continuations_SpecializeDriverShaders.pipe |  285 +++
 ...inuations_SpecializeDriverShaders_Isa.pipe |   40 +
 .../PipelineVsFs_EnableColorExport.pipe       |    2 +-
 .../PipelineVsFs_MultiDwordPushConst.pipe     |    2 +-
 llpc/tool/amdllpc.cpp                         |   21 +-
 llpc/tool/llpcCompilationUtils.cpp            |   13 +-
 llpc/tool/llpcCompilationUtils.h              |   27 +-
 llpc/tool/llpcComputePipelineBuilder.cpp      |   14 +-
 llpc/tool/llpcGraphicsPipelineBuilder.cpp     |   79 +-
 llpc/tool/llpcRayTracingPipelineBuilder.cpp   |    5 +-
 llpc/translator/lib/SPIRV/SPIRVReader.cpp     |  263 +-
 llpc/translator/lib/SPIRV/SPIRVReader.h       |   14 +-
 .../lib/SPIRV/SPIRVToLLVMDbgTran.cpp          |   10 +-
 .../lib/SPIRV/libSPIRV/SPIRV.debug.h          |   13 +-
 .../translator/lib/SPIRV/libSPIRV/SPIRVEnum.h |    2 +
 .../lib/SPIRV/libSPIRV/SPIRVInstruction.h     |   18 +-
 .../lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h     |    4 +-
 .../lib/SPIRV/libSPIRV/SPIRVModule.cpp        |    4 +-
 .../lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h     |    2 +
 llpc/unittests/context/testOptLevel.cpp       |    4 +-
 llpc/util/llpcShaderModuleHelper.cpp          |  455 ++--
 llpc/util/llpcShaderModuleHelper.h            |   14 +-
 llvmraytracing/CMakeLists.txt                 |    2 +-
 llvmraytracing/include/lgc/GpurtDialect.td    |    6 -
 llvmraytracing/include/lgc/LgcCpsDialect.td   |    3 +-
 .../include/llvmraytracing/Continuations.h    |   13 +-
 .../llvmraytracing/ContinuationsUtil.h        |   22 +-
 .../include/llvmraytracing/PipelineState.h    |    2 +
 .../llvmraytracing/SpecializeDriverShaders.h  |  177 ++
 llvmraytracing/lib/CleanupContinuations.cpp   |  313 ++-
 llvmraytracing/lib/Continuations.cpp          |   68 +-
 llvmraytracing/lib/CpsStackLowering.cpp       |    4 +
 .../lib/DXILContIntrinsicPrepare.cpp          |   40 +-
 llvmraytracing/lib/DXILContPostProcess.cpp    |  147 +-
 .../lib/LegacyCleanupContinuations.cpp        |  706 ------
 llvmraytracing/lib/LowerAwait.cpp             |   69 +-
 .../lib/LowerRaytracingPipeline.cpp           |  288 ++-
 llvmraytracing/lib/PassRegistry.inc           |   12 +-
 llvmraytracing/lib/PipelineState.cpp          |   16 +-
 .../lib/SpecializeDriverShaders.cpp           | 1321 ++++++++++
 .../test/dx/cleanup-continuations-malloc.ll   |   28 +-
 .../test/dx/cleanup-continuations.ll          |   38 +-
 .../test/dx/closest-hit-procedural.ll         |    6 +-
 .../test/dx/closest-hit-traceray.ll           |    6 +-
 llvmraytracing/test/dx/closest-hit.ll         |    4 +-
 .../test/dx/continuation-registercount.ll     |    6 +-
 .../test/dx/continuation-stacksize.ll         |    6 +-
 llvmraytracing/test/dx/continuation-state.ll  |   42 +-
 .../test/dx/continuation-without-await.ll     |   30 +-
 .../test/dx/dxil-cont-prepare-traversal.ll    |   20 +-
 .../test/dx/dxil-cps-stack-lowering-global.ll |    6 +-
 .../dx/dxil-cps-stack-lowering-scratch.ll     |    6 +-
 ...-raygen-cont-state-in-persistent-launch.ll |    2 +-
 llvmraytracing/test/dx/global-mem-stack.ll    |    4 +-
 .../test/dx/inline-const-jump-target.ll       |   10 +-
 .../test/dx/intersection-registercount.ll     |    4 +-
 llvmraytracing/test/dx/intrinsics/complete.ll |    5 +-
 .../cont-payload-registers-get-i32.ll         |    7 +-
 .../cont-payload-registers-i32-count.ll       |    5 +-
 .../cont-payload-registers-set-i32.ll         |    7 +-
 .../test/dx/intrinsics/cont-stack-alloc.ll    |    6 +-
 .../dx/intrinsics/get-current-func-addr.ll    |    2 +
 .../test/dx/intrinsics/get-flags.ll           |   24 -
 .../test/dx/intrinsics/get-shader-kind.ll     |    3 +-
 .../test/dx/intrinsics/get-shader-rec-idx.ll  |   95 +
 .../test/dx/intrinsics/shader-index.ll        |    4 +-
 llvmraytracing/test/dx/lower-await.ll         |  174 +-
 .../test/dx/lower-rt-pipeline-call-shader.ll  |   17 +-
 .../test/dx/lower-rt-pipeline-exit-raygen.ll  |    2 +
 .../dx/lower-rt-pipeline-intrinsics-hit.ll    |   31 +-
 .../test/dx/lower-rt-pipeline-intrinsics.ll   |   66 +-
 .../dx/lower-rt-pipeline-large-payload.ll     |  107 +-
 .../lower-rt-pipeline-simple-call-shader.ll   |   71 +-
 .../lower-rt-pipeline-small-payload-field.ll  |    6 +-
 llvmraytracing/test/dx/lower-rt-pipeline.ll   |  302 +--
 .../test/dx/paq-hit-attribute-size.ll         |   22 +-
 .../test/dx/payload-caller-in-paq.ll          |   10 +-
 .../test/dx/payload-save-registers.ll         |   30 +-
 llvmraytracing/test/dx/payload.ll             | 2159 +++++++++++------
 llvmraytracing/test/dx/remat-intrinsic.ll     |    4 +-
 .../test/dx/remove-unused-declarations.ll     |    2 +-
 .../dx/specialize-driver-shaders/analysis.ll  |  483 ++++
 .../lower-rt-pipeline-args.ll                 |  467 ++++
 .../specialization.ll                         |  104 +
 llvmraytracing/test/dx/stats-report-sizes.ll  |    9 +-
 llvmraytracing/test/dx/traceray.ll            |  258 +-
 .../test/dx/traversal-empty-payload.ll        |   11 +-
 .../test/dx/traversal-passthrough-payload.ll  |    9 +-
 .../test/dx/unnamed-type-intrinsics.ll        |    8 +-
 llvmraytracing/test/dx/wrong-system-data.ll   |    1 +
 .../intrinsics/get-func-addr-not-found.ll     |    2 +-
 .../test/intrinsics/get-func-addr.ll          |    1 +
 .../test/intrinsics/shader-start.ll           |    3 +-
 llvmraytracing/test/lgccps/alloca-select.ll   |   15 +-
 llvmraytracing/test/lgccps/await-if-else.ll   |   22 +-
 llvmraytracing/test/lgccps/await-if.ll        |   16 +-
 llvmraytracing/test/lgccps/await-in-loop.ll   |   16 +-
 .../test/lgccps/call-shader-i1-payload.ll     |   18 +-
 .../test/lgccps/cleanup-store-loads.ll        |   18 +-
 .../test/lgccps/entry-point-with-cps.ll       |   62 +-
 .../cont-payload-registers-get-i32.ll         |    5 +-
 .../cont-payload-registers-i32-count.ll       |    5 +-
 .../cont-payload-registers-set-i32.ll         |    5 +-
 llvmraytracing/test/lgccps/lower-traversal.ll |   19 +-
 llvmraytracing/test/lgccps/multiple-await.ll  |   20 +-
 .../test/lgccps/simple-await-more-state.ll    |   14 +-
 llvmraytracing/test/lgccps/simple-await.ll    |   19 +-
 tool/dumper/vkgcPipelineDumper.cpp            |    6 +-
 tool/vfx/vfxVkSection.h                       |    1 +
 util/extensions.txt                           |    1 +
 util/gpurtshim/GpurtShim.cpp                  |    2 -
 util/vkgcExtension.cpp                        |    1 +
 util/vkgcExtension.h                          |    1 +
 version/CMakeLists.txt                        |   60 +-
 version/include/llpc/GpurtIntrinsics.h        |    7 +-
 version/include/llpcVersion.h.in              |    3 +-
 400 files changed, 16539 insertions(+), 5242 deletions(-)
 create mode 100644 cmake/LlvmMainRevision.cmake
 create mode 100644 compilerutils/include/compilerutils/DxilUtils.h
 create mode 100644 compilerutils/test/cross-module-inliner/inc/link-struct-ptr-argument.ll
 create mode 100644 compilerutils/test/cross-module-inliner/link-struct-ptr-argument.ll
 create mode 100644 compilerutils/test/value-origin-tracking/freeze-mode.ll
 rename lgc/include/lgc/patch/{PatchWorkarounds.h => ApplyWorkarounds.h} (99%)
 rename lgc/include/lgc/patch/{PatchResourceCollect.h => CollectResourceUsage.h} (99%)
 rename lgc/include/lgc/patch/{FragColorExport.h => FragmentColorExport.h} (99%)
 rename lgc/include/lgc/patch/{PatchBufferOp.h => LowerBufferOperations.h} (97%)
 rename lgc/include/lgc/patch/{PatchReadFirstLane.h => LowerReadFirstLane.h} (98%)
 rename lgc/include/lgc/patch/{TcsPassthroughShader.h => PassthroughHullShader.h} (98%)
 rename lgc/include/lgc/patch/{PatchPeepholeOpt.h => PeepholeOptimization.h} (98%)
 rename lgc/include/lgc/patch/{PatchPreparePipelineAbi.h => PreparePipelineAbi.h} (99%)
 rename lgc/include/lgc/patch/{PatchSetupTargetFeatures.h => SetupTargetFeatures.h} (98%)
 create mode 100644 lgc/include/lgc/patch/StructurizeBuffers.h
 create mode 100644 lgc/include/lgc/util/MsgPackScanner.h
 create mode 100644 lgc/interface/lgc/RegStackUsage.h
 rename lgc/patch/{PatchWorkarounds.cpp => ApplyWorkarounds.cpp} (99%)
 rename lgc/patch/{PatchResourceCollect.cpp => CollectResourceUsage.cpp} (97%)
 rename lgc/patch/{FragColorExport.cpp => FragmentColorExport.cpp} (99%)
 rename lgc/patch/{PatchNullFragShader.cpp => GenerateNullFragmentShader.cpp} (97%)
 rename lgc/patch/{PatchNullFragShader.h => GenerateNullFragmentShader.h} (98%)
 rename lgc/patch/{PatchBufferOp.cpp => LowerBufferOperations.cpp} (92%)
 rename lgc/patch/{PatchReadFirstLane.cpp => LowerReadFirstLane.cpp} (96%)
 rename lgc/patch/{TcsPassthroughShader.cpp => PassthroughHullShader.cpp} (99%)
 rename lgc/patch/{PatchPeepholeOpt.cpp => PeepholeOptimization.cpp} (98%)
 rename lgc/patch/{PatchPreparePipelineAbi.cpp => PreparePipelineAbi.cpp} (99%)
 rename lgc/patch/{PatchSetupTargetFeatures.cpp => SetupTargetFeatures.cpp} (99%)
 create mode 100644 lgc/patch/StructurizeBuffers.cpp
 create mode 100644 lgc/test/CleanUndefOutputValues.lgc
 create mode 100644 lgc/test/Transforms/PatchBufferOp/buffer-index-op.lgc
 create mode 100644 lgc/test/Transforms/PatchBufferOp/buffer.atomic.ops.lgc
 create mode 100644 lgc/unittests/internal/CMakeLists.txt
 create mode 100644 lgc/unittests/internal/MsgPackScannerTest.cpp
 create mode 100644 lgc/util/MsgPackScanner.cpp
 create mode 100644 lgc/util/RegStackUsage.cpp
 create mode 100644 llpc/context/llpcDialect.cpp
 create mode 100644 llpc/include/LlpcDialect.td
 create mode 100644 llpc/test/shaderdb/core/OpExtInst_PackHalf2x16.comp
 create mode 100644 llpc/test/shaderdb/debug_info/PipelineGsTess_TestVsTesGsMergeShader.pipe
 create mode 100644 llpc/test/shaderdb/debug_info/PipelineGs_TestVsGSMergeShader.pipe
 create mode 100644 llpc/test/shaderdb/general/CoherentArray.frag
 create mode 100644 llpc/test/shaderdb/general/CoherentVector.frag
 create mode 100644 llpc/test/shaderdb/general/OptimizePointSizeWrite.pipe
 create mode 100644 llpc/test/shaderdb/general/PipelineVsFs_ColorExportShader.pipe
 create mode 100644 llpc/test/shaderdb/hlsl/Hlsl_TestStructuredBuffers.spvasm
 create mode 100644 llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_SpecializeDriverShaders.pipe
 create mode 100644 llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_SpecializeDriverShaders_Isa.pipe
 create mode 100644 llvmraytracing/include/llvmraytracing/SpecializeDriverShaders.h
 delete mode 100644 llvmraytracing/lib/LegacyCleanupContinuations.cpp
 create mode 100644 llvmraytracing/lib/SpecializeDriverShaders.cpp
 delete mode 100644 llvmraytracing/test/dx/intrinsics/get-flags.ll
 create mode 100644 llvmraytracing/test/dx/intrinsics/get-shader-rec-idx.ll
 create mode 100644 llvmraytracing/test/dx/specialize-driver-shaders/analysis.ll
 create mode 100644 llvmraytracing/test/dx/specialize-driver-shaders/lower-rt-pipeline-args.ll
 create mode 100644 llvmraytracing/test/dx/specialize-driver-shaders/specialization.ll

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1799619c34..96a039a1cf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -206,7 +206,11 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
         if (TARGET llvm-dialects-example)
             set_property(TARGET llvm-dialects-example PROPERTY FOLDER Misc)
         endif()
-        if (LLVM_OPTIMIZED_TABLEGEN)
+        if (NOT WIN32 AND LLVM_OPTIMIZED_TABLEGEN)
+#if _WIN32
+            # These targets don't exist on Windows when CMake is first invoked.
+            # They are created later at build time, when the cross-compilation takes place.
+#endif
             set_property(TARGET llvm_nm_target PROPERTY FOLDER Misc)
             set_property(TARGET llvm_readobj_target PROPERTY FOLDER Misc)
             set_property(TARGET llvm-min-tblgen-host PROPERTY FOLDER Misc)
@@ -215,9 +219,8 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
             set_property(TARGET CREATE_LLVM_NATIVE PROPERTY FOLDER Misc)
         endif()
 #if _WIN32
-        if (MSVC)
-            # We can't use LLVM_OPTIMIZED_TABLEGEN on Windows, and the 32-bit llvm-tblgen can easily
-            # to run out of memory. Tell the linker to allow addresses larger than 2GB.
+        if(MSVC)
+            # The 32-bit llvm-tblgen can easily run out of memory. Tell the linker to allow addresses larger than 2GB.
             set_property(TARGET llvm-tblgen PROPERTY LINK_FLAGS "/LARGEADDRESSAWARE")
         endif()
 #endif
diff --git a/cmake/LlvmMainRevision.cmake b/cmake/LlvmMainRevision.cmake
new file mode 100644
index 0000000000..d2800a724d
--- /dev/null
+++ b/cmake/LlvmMainRevision.cmake
@@ -0,0 +1,44 @@
+##
+ #######################################################################################################################
+ #
+ #  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ #
+ #  Permission is hereby granted, free of charge, to any person obtaining a copy
+ #  of this software and associated documentation files (the "Software"), to
+ #  deal in the Software without restriction, including without limitation the
+ #  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ #  sell copies of the Software, and to permit persons to whom the Software is
+ #  furnished to do so, subject to the following conditions:
+ #
+ #  The above copyright notice and this permission notice shall be included in all
+ #  copies or substantial portions of the Software.
+ #
+ #  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ #  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ #  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ #  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ #  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ #  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ #  IN THE SOFTWARE.
+ #
+ #######################################################################################################################
+
+# Include this file to set LLVM_MAIN_REVISION, for when it is needed at cmake level rather than C++ level.
+
+if (NOT LLVM_MAIN_REVISION)
+  # A sneaky way to get the LLVM source directory, assuming we are included from an LLVM external
+  # project such as LGC or LLPCFE.
+  get_filename_component(LLVM_SOURCE_DIR "${CPACK_RESOURCE_FILE_LICENSE}" DIRECTORY)
+  if (NOT LLVM_SOURCE_DIR)
+      message(FATAL_ERROR "LLVM_SOURCE_DIR not found")
+  endif()
+
+  # Scrape LLVM_MAIN_REVISION out of llvm-config.h.cmake. If not found, set to a high number.
+  set(LLVM_CONFIG_H_NAME "${LLVM_SOURCE_DIR}/include/llvm/Config/llvm-config.h.cmake")
+  file(READ "${LLVM_CONFIG_H_NAME}" LLVM_CONFIG_H_CONTENTS)
+  string(REGEX REPLACE "^.* LLVM_MAIN_REVISION ([0-9]+).*$" "\\1" LLVM_MAIN_REVISION "${LLVM_CONFIG_H_CONTENTS}")
+  if ("${LLVM_MAIN_REVISION}" STREQUAL "${LLVM_CONFIG_H_CONTENTS}")
+      set(LLVM_MAIN_REVISION 999999999)
+  endif()
+endif()
+
diff --git a/compilerutils/include/compilerutils/CompilerUtils.h b/compilerutils/include/compilerutils/CompilerUtils.h
index 207df2eef5..200bbd1053 100644
--- a/compilerutils/include/compilerutils/CompilerUtils.h
+++ b/compilerutils/include/compilerutils/CompilerUtils.h
@@ -92,7 +92,20 @@ struct CrossModuleInlinerResult {
 // One CrossModuleInliner instance must only be used for a single target module, otherwise things can go wrong.
 class CrossModuleInliner {
 public:
-  CrossModuleInliner() = default;
+  // Callback passed to getGlobalInModule, that tries to find an existing GlobalValue in the target module or copies it
+  // to the target module.
+  using GetGlobalInModuleTy = std::function<llvm::GlobalValue &(CrossModuleInliner &inliner,
+                                                                llvm::GlobalValue &sourceGV, llvm::Module &targetGv)>;
+
+  CrossModuleInliner(GetGlobalInModuleTy getGlobalInModuleCallback = defaultGetGlobalInModuleFunc);
+
+  // Do not allow copy but allow moving
+  CrossModuleInliner(const CrossModuleInliner &) = delete;
+  CrossModuleInliner(CrossModuleInliner &&);
+  CrossModuleInliner &operator=(const CrossModuleInliner &) = delete;
+  CrossModuleInliner &operator=(CrossModuleInliner &&);
+
+  ~CrossModuleInliner() noexcept;
 
   // Inline a call to a function even if the called function is in a different module.
   // If the result of that function call should be used, a use must exist before calling this function.
@@ -118,19 +131,21 @@ class CrossModuleInliner {
   // target module.
   llvm::GlobalValue *findCopiedGlobal(llvm::GlobalValue &sourceGv, llvm::Module &targetModule);
 
+  // Default implementation that finds global values using getCrossModuleName.
+  static llvm::GlobalValue &defaultGetGlobalInModuleFunc(CrossModuleInliner &inliner, llvm::GlobalValue &sourceGv,
+                                                         llvm::Module &targetModule);
+
   static std::string getCrossModuleName(llvm::GlobalValue &gv);
 
 private:
   // Checks that we haven't processed a different target module earlier.
-  void checkTargetModule(llvm::Module &targetModule) {
-    if (lastUsedTargetModule == nullptr)
-      lastUsedTargetModule = &targetModule;
-    else
-      assert(lastUsedTargetModule == &targetModule);
-  }
+  void checkTargetModule(llvm::Module &targetModule);
+
+  struct Impl;
+  class CrossModuleValueMaterializer;
 
-  llvm::SmallDenseMap<llvm::GlobalValue *, llvm::GlobalValue *> mappedGlobals;
-  llvm::Module *lastUsedTargetModule = nullptr; // used to check that we don't use different target modules
+  // Split into Impl class, so we don't need to include everything in this header.
+  std::unique_ptr<Impl> impl;
 };
 
 // Essentially RAUW for pointers for the case that these use different address
@@ -140,6 +155,13 @@ class CrossModuleInliner {
 // The caller has to handle the erasure afterwards.
 void replaceAllPointerUses(llvm::IRBuilder<> *builder, llvm::Value *oldPointerValue, llvm::Value *newPointerValue,
                            llvm::SmallVectorImpl<llvm::Instruction *> &toBeRemoved);
+
+// Create a GEP if idx is non-zero, otherwise return the pointer.
+llvm::Value *simplifyingCreateConstGEP1_32(llvm::IRBuilder<> &builder, llvm::Type *ty, llvm::Value *ptr, uint32_t idx);
+
+// Create an inbounds GEP if idx is non-zero, otherwise return the pointer.
+llvm::Value *simplifyingCreateConstInBoundsGEP1_32(llvm::IRBuilder<> &builder, llvm::Type *ty, llvm::Value *ptr,
+                                                   uint32_t idx);
 } // namespace CompilerUtils
 
 namespace llvm {
diff --git a/compilerutils/include/compilerutils/DxilUtils.h b/compilerutils/include/compilerutils/DxilUtils.h
new file mode 100644
index 0000000000..837859ebf6
--- /dev/null
+++ b/compilerutils/include/compilerutils/DxilUtils.h
@@ -0,0 +1,64 @@
+/*
+ ***********************************************************************************************************************
+ *
+ *  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal in the Software without restriction, including without limitation the
+ *  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ *  sell copies of the Software, and to permit persons to whom the Software is
+ *  furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice shall be included in all
+ *  copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ *
+ **********************************************************************************************************************/
+
+//===- DxilUtils.h -  --------------------------------------------------------------------------------------------===//
+//
+// Shared DXIL-related helpers.
+//
+//===--------------------------------------------------------------------------------------------------------------===//
+
+#pragma once
+
+#include "llvm/ADT/StringRef.h"
+
+namespace CompilerUtils::dxil {
+
+// Try to demangle function names in the DXIL format:
+// ...\01?FuncName@@...
+// @param inputName : Possibly mangled function name
+// @returns: the original string if the name was not demangleable or the demangled function name.
+inline llvm::StringRef tryDemangleFunctionName(llvm::StringRef inputName) {
+  assert(!inputName.empty());
+
+  constexpr static llvm::StringRef manglingPrefix = "\01?";
+
+  // Expect both characters to be there, and `\01?` to occur before `@@`
+  size_t start = inputName.find(manglingPrefix);
+  if (start == llvm::StringRef::npos)
+    return inputName;
+
+  // The case start >= end is implicitly checked by the second call to `find`.
+  const size_t end = inputName.find("@@", start);
+  if (end == llvm::StringRef::npos)
+    return inputName;
+
+  start += manglingPrefix.size();
+
+  // Extract unmangled name: Return everything after the first occurrence of `\01?` and before the first occurrence of
+  // `@@` after `?`.
+  return inputName.substr(start, end - start);
+}
+
+} // namespace CompilerUtils::dxil
diff --git a/compilerutils/include/compilerutils/ValueOriginTracking.h b/compilerutils/include/compilerutils/ValueOriginTracking.h
index 6e3d9215d4..dbc5135ce7 100644
--- a/compilerutils/include/compilerutils/ValueOriginTracking.h
+++ b/compilerutils/include/compilerutils/ValueOriginTracking.h
@@ -106,9 +106,12 @@ namespace ValueTracking {
 struct SliceStatus {
   // As the actual enum is contained within the struct, its values don't leak into the containing namespace,
   // and it's not possible to implicitly cast a SliceStatus to an int, so it's as good as an enum class.
+  // The UndefOrPoison case always originates from a `poison` or `undef` value.
+  // We must be careful with freeze instructions operating on such values, see FreezeHandlingMode.
   enum StatusEnum : uint32_t { Constant = 0x1, Dynamic = 0x2, UndefOrPoison = 0x4 };
   StatusEnum S = {};
 
+  // Intentionally allow implicit conversion:
   SliceStatus(StatusEnum S) : S{S} {}
 
   static SliceStatus makeEmpty() { return static_cast<StatusEnum>(0); }
@@ -188,6 +191,45 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const ValueInfo &VI);
 // constant and then always propagated, allowing to replace the argument by the initial constant.
 class ValueOriginTracker {
 public:
+  // Configuration options for ValueOriginTracker.
+  struct Options {
+    unsigned BytesPerSlice = 4;
+    unsigned MaxBytesPerValue = 512;
+
+    // Freeze instructions are problematic for value origin tracking.
+    //
+    // While `freeze poison` is intended to help optimization by allowing it to pick any value, we cannot just
+    // treat `freeze poison` as UndefOrPoison, because an optimization relying on that would need to ensure
+    // other users of the optimized `freeze poison` observe the same value picked by optimization, and value origin
+    // tracking does not allow to query which `freeze poison` instructions a particular slice originates from.
+    // Instead, the only safe way to treat `freeze poison` is dynamic.
+    //
+    // In some cases, e.g. when not optimizing based on the analysis result, and instead just using it for sanity
+    // checking in testing, treating `freeze poison` as UndefOrPoison however is the intended result, and if
+    // value origin tracking implicitly considered all `freeze poison` as dynamic, then client code would need to
+    // propagate the intended UndefOrPoison semantics manually.
+    //
+    // The FreezeHandlingMode enum allows to avoid that, allowing the client to specify how `freeze poison` and
+    // `freeze undef` should be handled.
+    //
+    // If we want to optimize based on `freeze poison`, one option would be replacing all freeze instructions by some
+    // constant (e.g. `zeroinitializer`) before running the analysis, as some LLVM transforms like instcombine do.
+    // This ensures that not only the analysis sees a common constant value for `freeze poison`, but also ensures other
+    // uses of `freeze poison` observe the same value.
+    //
+    // As a less conservative potential future improvement, we could instead explicitly keep track of FrozenPoison
+    // slices in value origin tracking, and when merging FrozenPoison with constants, recording which `freeze poison`
+    // values need to be replaced by which constants to allow that.
+    enum class FreezeHandlingMode {
+      // Treat slices in freeze instructions that are UndefOrPoison in the freeze operand as dynamic.
+      Dynamic = 0,
+      // Always forward value infos of freeze operands for freeze instructions.
+      // In particular, `freeze poison` is always reported as UndefOrPoison.
+      Forward
+    };
+    FreezeHandlingMode FreezeMode = FreezeHandlingMode::Dynamic;
+  };
+
   using ValueInfo = ValueTracking::ValueInfo;
   // In some cases, client code has additional information on where values originate from, or
   // where they should be assumed to originate from just for the purpose of the analysis.
@@ -225,10 +267,9 @@ class ValueOriginTracker {
   // Also, only a single status on assumptions is allowed.
   using ValueOriginAssumptions = llvm::DenseMap<llvm::Instruction *, ValueInfo>;
 
-  ValueOriginTracker(const llvm::DataLayout &DL, unsigned BytesPerSlice = 4, unsigned MaxBytesPerValue = 512,
+  ValueOriginTracker(const llvm::DataLayout &DL, Options Opts,
                      ValueOriginAssumptions OriginAssumptions = ValueOriginAssumptions{})
-      : DL{DL}, BytesPerSlice{BytesPerSlice}, MaxBytesPerValue{MaxBytesPerValue},
-        OriginAssumptions(std::move(OriginAssumptions)) {}
+      : DL{DL}, Opts{Opts}, OriginAssumptions(std::move(OriginAssumptions)) {}
 
   // Computes a value info for the given value.
   // If the value has been seen before, returns a cache hit from the ValueInfos map.
@@ -247,8 +288,7 @@ class ValueOriginTracker {
 private:
   struct ValueInfoBuilder;
   const llvm::DataLayout &DL;
-  unsigned BytesPerSlice = 0;
-  unsigned MaxBytesPerValue = 0;
+  Options Opts;
   ValueOriginAssumptions OriginAssumptions;
   llvm::DenseMap<llvm::Value *, ValueInfo> ValueInfos;
 
diff --git a/compilerutils/lib/CompilerUtils.cpp b/compilerutils/lib/CompilerUtils.cpp
index 80ba2b0d2b..8590a6fe90 100644
--- a/compilerutils/lib/CompilerUtils.cpp
+++ b/compilerutils/lib/CompilerUtils.cpp
@@ -44,6 +44,7 @@
 #define DEBUG_TYPE "compilerutils"
 
 using namespace llvm;
+using namespace CompilerUtils;
 
 // Whether this is a load instruction that should translate to a last_use
 // load.
@@ -164,74 +165,104 @@ void CompilerUtils::setIsLastUseLoad(llvm::LoadInst &Load) {
 
 namespace {
 
-class CrossModuleValueMaterializer : public ValueMaterializer {
-public:
-  CrossModuleValueMaterializer(Module *targetMod, CompilerUtils::CrossModuleInliner &inliner,
-                               SmallDenseMap<GlobalValue *, GlobalValue *> &mapped)
-      : targetMod(targetMod), inliner(&inliner), mapped(&mapped) {}
-  virtual ~CrossModuleValueMaterializer() = default;
-
-  void setMapper(ValueMapper *mapper) { this->mapper = mapper; }
+// Map Types from source to target module.
+struct CrossModuleTypeRemapper : public ValueMapTypeRemapper {
+  CrossModuleTypeRemapper() = default;
 
-  virtual Value *materialize(Value *v) override {
-    if (auto *gv = dyn_cast<GlobalValue>(v)) {
-      if (gv->getParent() == targetMod)
-        return nullptr;
-
-      auto *newGv = moveGlobalValueToNewModule(gv);
-      return newGv;
+  Type *remapType(Type *SrcTy) override {
+    if (auto found = mappedTypes.find(SrcTy); found != mappedTypes.end()) {
+      return found->second;
     }
-    return nullptr;
+    return SrcTy;
   }
 
-private:
-  GlobalValue *moveGlobalValueToNewModule(GlobalValue *gv) {
-    if (auto *existing = inliner->findCopiedGlobal(*gv, *targetMod))
-      return existing;
-
-    auto newName = CompilerUtils::CrossModuleInliner::getCrossModuleName(*gv);
-    if (auto *callee = dyn_cast<Function>(gv)) {
-      if (!callee->isDeclaration()) {
-        report_fatal_error(
-            Twine("Cross module inlining does not support functions with calls to functions with a body. "
-                  "Run the inliner before trying to inline across modules (trying to call '") +
-            callee->getName() + "')");
-      }
+  DenseMap<Type *, Type *> mappedTypes;
+};
 
-      // Create a function declaration
-      auto *newGv =
-          CompilerUtils::cloneFunctionHeader(*callee, callee->getFunctionType(), callee->getAttributes(), targetMod);
-      newGv->setName(newName);
+} // anonymous namespace
 
-      (*mapped)[gv] = newGv;
-      return newGv;
-    }
+class CrossModuleInliner::CrossModuleValueMaterializer : public ValueMaterializer {
+public:
+  CrossModuleValueMaterializer(CrossModuleInliner &inliner) : inliner(&inliner) {}
+  virtual ~CrossModuleValueMaterializer() = default;
 
-    if (auto *gVar = dyn_cast<GlobalVariable>(gv)) {
-      // Create a global with the correct type
-      auto *newGv = new GlobalVariable(*targetMod, gVar->getValueType(), gVar->isConstant(), gVar->getLinkage(),
-                                       nullptr, newName, nullptr, gVar->getThreadLocalMode(), gVar->getAddressSpace());
-      newGv->copyAttributesFrom(gVar);
-      if (gVar->hasInitializer()) {
-        // Recursively map initializer
-        auto *newInit = mapper->mapConstant(*gVar->getInitializer());
-        newGv->setInitializer(newInit);
-      }
+  virtual Value *materialize(Value *v) override;
 
-      (*mapped)[gv] = newGv;
-      return newGv;
+  CrossModuleInliner *inliner;
+};
+
+struct CrossModuleInliner::Impl {
+  Impl(CrossModuleInliner &inliner, GetGlobalInModuleTy getGlobalInModuleFunc)
+      : materializer(inliner), mapper(map, RF_IgnoreMissingLocals, &typeRemapper, &materializer),
+        getGlobalInModuleFunc(std::move(getGlobalInModuleFunc)) {}
+
+  CrossModuleTypeRemapper typeRemapper;
+  CrossModuleValueMaterializer materializer;
+  llvm::ValueToValueMapTy map;
+  llvm::ValueMapper mapper;
+  GetGlobalInModuleTy getGlobalInModuleFunc;
+  llvm::Module *targetMod = nullptr;
+};
+
+Value *CrossModuleInliner::CrossModuleValueMaterializer::materialize(Value *v) {
+  if (auto *gv = dyn_cast<GlobalValue>(v)) {
+    if (gv->getParent() == inliner->impl->targetMod)
+      return nullptr;
+
+    GlobalValue *newGv = inliner->findCopiedGlobal(*gv, *inliner->impl->targetMod);
+    if (!newGv)
+      newGv = &inliner->impl->getGlobalInModuleFunc(*inliner, *gv, *inliner->impl->targetMod);
+
+    // Insert into mappedTypes if there is no entry yet.
+    // Ensure recorded type mappings are consistent.
+    auto &mappedTypes = inliner->impl->typeRemapper.mappedTypes;
+    auto InsertToMappedTypes = [&mappedTypes](Type *sourceType, Type *copiedType) {
+      assert((sourceType != nullptr) && (copiedType != nullptr));
+      if (sourceType != copiedType) {
+        auto found = mappedTypes.insert(std::make_pair(sourceType, copiedType));
+        assert((found.second || copiedType == found.first->second) && "Inconsistent type mapping");
+      }
+    };
+    if (isa<GlobalVariable>(newGv)) {
+      Type *sourceType = gv->getValueType();
+      Type *copiedType = newGv->getValueType();
+      InsertToMappedTypes(sourceType, copiedType);
+    } else if (auto *func = dyn_cast<Function>(gv)) {
+      // Map type for function arguments and return.
+      FunctionType *sourceFuncTy = dyn_cast<FunctionType>(func->getFunctionType());
+      FunctionType *copiedFuncTy = dyn_cast<FunctionType>(cast<Function>(newGv)->getFunctionType());
+      for (unsigned index = 0; index < sourceFuncTy->getNumParams(); ++index) {
+        Type *sourceArgTy = sourceFuncTy->getParamType(index);
+        Type *copiedArgTy = copiedFuncTy->getParamType(index);
+        InsertToMappedTypes(sourceArgTy, copiedArgTy);
+      }
+      Type *sourceRetType = func->getReturnType();
+      Type *copiedRetType = copiedFuncTy->getReturnType();
+      InsertToMappedTypes(sourceRetType, copiedRetType);
     }
 
-    report_fatal_error("Encountered unknown global object while inlining");
+    return newGv;
   }
+  return nullptr;
+}
 
-  Module *targetMod;
-  CompilerUtils::CrossModuleInliner *inliner;
-  SmallDenseMap<GlobalValue *, GlobalValue *> *mapped;
-  ValueMapper *mapper;
-};
+CrossModuleInliner::CrossModuleInliner(GetGlobalInModuleTy getGlobalInModuleCallback)
+    : impl(std::make_unique<Impl>(*this, std::move(getGlobalInModuleCallback))) {
+}
 
-} // anonymous namespace
+CrossModuleInliner::CrossModuleInliner(CrossModuleInliner &&inliner) : impl(std::move(inliner.impl)) {
+  if (impl)
+    impl->materializer.inliner = this;
+}
+
+CrossModuleInliner &CrossModuleInliner::operator=(CrossModuleInliner &&inliner) {
+  impl = std::move(inliner.impl);
+  if (impl)
+    impl->materializer.inliner = this;
+  return *this;
+}
+
+CrossModuleInliner::~CrossModuleInliner() = default;
 
 iterator_range<Function::iterator> CompilerUtils::CrossModuleInliner::inlineCall(CallBase &cb) {
   auto *calleeFunc = cb.getCalledFunction();
@@ -242,10 +273,11 @@ iterator_range<Function::iterator> CompilerUtils::CrossModuleInliner::inlineCall
   Function *targetFunc = cb.getFunction();
   auto *targetMod = targetFunc->getParent();
   auto callBb = cb.getParent()->getIterator();
+
   auto callBbSuccessor = callBb;
   ++callBbSuccessor;
   const bool callBbHasSuccessor = callBbSuccessor != targetFunc->end();
-  const size_t bbCount = targetFunc->size();
+  [[maybe_unused]] const size_t bbCount = targetFunc->size();
   // Save uses of the return value
   SmallVector<Value *> users(cb.users());
 
@@ -291,10 +323,6 @@ iterator_range<Function::iterator> CompilerUtils::CrossModuleInliner::inlineCall
     assert(!calleeFunc->getParent()->getName().empty() && "Can only inline from modules that have a name");
 
     // Look for references to global values and replace them with global values in the new module
-    CrossModuleValueMaterializer materializer{targetMod, *this, mappedGlobals};
-    ValueToValueMapTy map;
-    ValueMapper mapper{map, RF_IgnoreMissingLocals, nullptr, &materializer};
-    materializer.setMapper(&mapper);
     for (auto bb = firstNewBb; bb != lastNewBb; bb++) {
       bool skipBeforeInsts = hasInstBefore && bb == firstNewBb;
       for (auto &i : *bb) {
@@ -307,7 +335,7 @@ iterator_range<Function::iterator> CompilerUtils::CrossModuleInliner::inlineCall
         if (hasInstAfter && &i == &*instAfter)
           break;
 
-        mapper.remapInstruction(i);
+        impl->mapper.remapInstruction(i);
       }
       assert((bb != firstNewBb || !hasInstBefore || !skipBeforeInsts) && "Did not find first instruction");
     }
@@ -315,7 +343,7 @@ iterator_range<Function::iterator> CompilerUtils::CrossModuleInliner::inlineCall
     // If the inlined function returned a constant, that gets inlined into the users of the original value. Iterate over
     // these to catch all global values
     for (auto *u : users)
-      mapper.remapInstruction(*cast<Instruction>(u));
+      impl->mapper.remapInstruction(*cast<Instruction>(u));
   }
 
   return make_range(firstNewBb, lastNewBb);
@@ -361,28 +389,86 @@ CompilerUtils::CrossModuleInliner::inlineCall(IRBuilder<> &b, llvm::Function *ca
 }
 
 GlobalValue *CompilerUtils::CrossModuleInliner::findCopiedGlobal(GlobalValue &sourceGv, Module &targetModule) {
-  assert(sourceGv.getParent() != &targetModule && "This function only finds copies across modules");
-  assert(sourceGv.hasName() && "Cannot find a global value that does not have a name");
   checkTargetModule(targetModule);
 
-  if (auto found = mappedGlobals.find(&sourceGv); found != mappedGlobals.end()) {
-    assert(found->second->getParent() == &targetModule &&
+  if (auto found = impl->map.find(&sourceGv); found != impl->map.end()) {
+    auto *global = cast<GlobalValue>(found->second);
+    assert(global->getParent() == &targetModule &&
            "The CrossModuleInliner can only be used with a single target module");
-    return found->second;
+    return global;
+  }
+
+  return nullptr;
+}
+
+llvm::GlobalValue &CrossModuleInliner::defaultGetGlobalInModuleFunc(CrossModuleInliner &inliner,
+                                                                    llvm::GlobalValue &sourceGv,
+                                                                    llvm::Module &targetModule) {
+  inliner.checkTargetModule(targetModule);
+  assert(inliner.impl && "Called GetGlobalInModule, but the inliner is currently not inlining anything");
+
+  // Try to find by name
+  if (auto *existing = targetModule.getNamedValue(CompilerUtils::CrossModuleInliner::getCrossModuleName(sourceGv)))
+    return *existing;
+
+  auto &mappedTypes = inliner.impl->typeRemapper.mappedTypes;
+  auto newName = getCrossModuleName(sourceGv);
+  if (auto *callee = dyn_cast<Function>(&sourceGv)) {
+    if (!callee->isDeclaration()) {
+      report_fatal_error(Twine("Cross module inlining does not support functions with calls to functions with a body. "
+                               "Run the inliner before trying to inline across modules (trying to call '") +
+                         callee->getName() + "')");
+    }
+
+    // FunctionType needs to be mapped outside of the ValueMaterializer to avoid failing when
+    // setting the function as an operand in the CallInst in remapInstruction.
+    FunctionType *sourceFuncTy = dyn_cast<FunctionType>(callee->getFunctionType());
+    SmallVector<Type *> params;
+    for (unsigned index = 0; index < sourceFuncTy->getNumParams(); ++index) {
+      Type *argTy = sourceFuncTy->getParamType(index);
+      Type *mappedTy = argTy;
+      if (auto found = mappedTypes.find(mappedTy); found != mappedTypes.end())
+        mappedTy = found->second;
+      params.push_back(mappedTy);
+    }
+
+    Type *returnTy = sourceFuncTy->getReturnType();
+    Type *mappedTy = returnTy;
+    if (auto found = mappedTypes.find(mappedTy); found != mappedTypes.end())
+      mappedTy = found->second;
+
+    // Create a function declaration
+    FunctionType *targetFuncTy = FunctionType::get(mappedTy, params, sourceFuncTy->isVarArg());
+    auto *newGv = CompilerUtils::cloneFunctionHeader(*callee, targetFuncTy, callee->getAttributes(), &targetModule);
+    newGv->setName(newName);
+    return *newGv;
+  }
+
+  if (auto *gVar = dyn_cast<GlobalVariable>(&sourceGv)) {
+    // Create a global with the correct type
+    Type *mappedTy = gVar->getValueType();
+    if (auto found = mappedTypes.find(mappedTy); found != mappedTypes.end())
+      mappedTy = found->second;
+    auto *newGv = new GlobalVariable(targetModule, mappedTy, gVar->isConstant(), gVar->getLinkage(), nullptr, newName,
+                                     nullptr, gVar->getThreadLocalMode(), gVar->getAddressSpace());
+    newGv->copyAttributesFrom(gVar);
+    if (gVar->hasInitializer()) {
+      // Recursively map initializer
+      auto *newInit = inliner.impl->mapper.mapConstant(*gVar->getInitializer());
+      newGv->setInitializer(newInit);
+    }
+    return *newGv;
   }
 
-  GlobalValue *gv = targetModule.getNamedValue(getCrossModuleName(sourceGv));
-  if (gv)
-    assert(gv->getValueType() == sourceGv.getValueType());
-  return gv;
+  report_fatal_error("Encountered unknown global object while inlining");
 }
 
 // Get the name of a global that is copied to a different module for inlining.
-std::string CompilerUtils::CrossModuleInliner::getCrossModuleName(GlobalValue &gv) {
+std::string CrossModuleInliner::getCrossModuleName(GlobalValue &gv) {
   if (auto *fn = dyn_cast<Function>(&gv)) {
     // Intrinsics should not be renamed since the IR verifier insists on a "correct" name mangling based on any
     // overloaded types. Lgc dialects also require exact name for similar reason.
-    if (fn->isIntrinsic() || fn->getName().starts_with("lgc."))
+    if (fn->isIntrinsic() || fn->getName().starts_with("lgc.") || fn->getName().starts_with("llpcfe."))
       return fn->getName().str();
   }
   return (Twine(gv.getName()) + ".cloned." + gv.getParent()->getName()).str();
@@ -398,6 +484,13 @@ PointerType *llvm::getWithSamePointeeType(PointerType *ptrTy, unsigned addressSp
 #endif
 }
 
+void CrossModuleInliner::checkTargetModule(llvm::Module &targetModule) {
+  if (impl->targetMod == nullptr)
+    impl->targetMod = &targetModule;
+  else
+    assert(impl->targetMod == &targetModule);
+}
+
 void CompilerUtils::replaceAllPointerUses(IRBuilder<> *builder, Value *oldPointerValue, Value *newPointerValue,
                                           SmallVectorImpl<Instruction *> &toBeRemoved) {
   // Note: The implementation explicitly supports typed pointers, which
@@ -456,6 +549,8 @@ void CompilerUtils::replaceAllPointerUses(IRBuilder<> *builder, Value *oldPointe
     }
     case Instruction::Load:
     case Instruction::Store:
+    case Instruction::AtomicRMW:
+    case Instruction::AtomicCmpXchg:
       // No further processing needed for the users.
       continue;
     case Instruction::InsertValue:
@@ -549,6 +644,19 @@ void CompilerUtils::replaceAllPointerUses(IRBuilder<> *builder, Value *oldPointe
 #endif
 }
 
+Value *CompilerUtils::simplifyingCreateConstGEP1_32(IRBuilder<> &builder, Type *ty, Value *ptr, uint32_t idx) {
+  // A GEP with a single zero index is redundant with opaque pointers
+  if (idx == 0)
+    return ptr;
+  return builder.CreateConstGEP1_32(ty, ptr, idx);
+}
+
+Value *CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(IRBuilder<> &builder, Type *ty, Value *ptr, uint32_t idx) {
+  if (idx == 0)
+    return ptr;
+  return builder.CreateConstInBoundsGEP1_32(ty, ptr, idx);
+}
+
 void CompilerUtils::RegisterPasses(llvm::PassBuilder &PB) {
 #define HANDLE_PASS(NAME, CREATE_PASS)                                                                                 \
   if (innerPipeline.empty() && name == NAME) {                                                                         \
diff --git a/compilerutils/lib/ValueOriginTracking.cpp b/compilerutils/lib/ValueOriginTracking.cpp
index a5f57e7d1f..3ed550f2c3 100644
--- a/compilerutils/lib/ValueOriginTracking.cpp
+++ b/compilerutils/lib/ValueOriginTracking.cpp
@@ -410,6 +410,26 @@ struct ValueOriginTracker::ValueInfoBuilder {
     assert(Result.Slices.size() == NumSlices);
     return Result;
   }
+
+  // Create a value info for a freeze instruction.
+  // For freeze, we must be careful to preserve freeze semantics on UndefOrPoison slices:
+  // In contrast to undef/poison, all uses of a freeze instruction are guaranteed to observe the same value.
+  ValueInfo createFreeze(const ValueInfo &FrozenValueInfo, Options::FreezeHandlingMode FreezeMode) const {
+    using Mode = Options::FreezeHandlingMode;
+    if (FreezeMode == Mode::Forward)
+      return FrozenValueInfo;
+
+    assert(FreezeMode == Mode::Dynamic);
+
+    ValueInfo Result = FrozenValueInfo;
+    for (unsigned SliceIdx = 0; SliceIdx < Result.Slices.size(); ++SliceIdx) {
+      SliceInfo &SI = Result.Slices[SliceIdx];
+      if (SI.Status.contains(SliceStatus::UndefOrPoison))
+        SI = getDynamicSlice(SliceIdx);
+    }
+
+    return Result;
+  }
 };
 
 // Implement status printing also here, because for multi-bit status we want to interleave the printing
@@ -485,7 +505,7 @@ ValueInfo ValueOriginTracker::computeConstantValueInfo(ValueInfoBuilder &VIB, ll
     return VIB.createDynamic();
 
   auto Ty = CV->getType();
-  unsigned BitsPerSlice = 8 * BytesPerSlice;
+  unsigned BitsPerSlice = 8 * Opts.BytesPerSlice;
   // Don't bother with dynamic vectors
   auto *VectorTy = dyn_cast<FixedVectorType>(Ty);
   auto *ArrayTy = dyn_cast<ArrayType>(Ty);
@@ -588,7 +608,7 @@ ValueInfo ValueOriginTracker::computeValueInfoFromAssumption(ValueInfoBuilder &V
 // treating dependencies on earlier loop iterations as dynamic. Thus, for PHI nodes, if dependencies have not yet
 // been analyzed, we assume loop dependencies and give up.
 ValueInfo ValueOriginTracker::computeValueInfo(llvm::Value *V) {
-  ValueInfoBuilder VIB{DL, V, BytesPerSlice, MaxBytesPerValue};
+  ValueInfoBuilder VIB{DL, V, Opts.BytesPerSlice, Opts.MaxBytesPerValue};
   if (isa<UndefValue>(V)) {
     return VIB.createUndef();
   }
@@ -610,14 +630,19 @@ ValueInfo ValueOriginTracker::computeValueInfo(llvm::Value *V) {
 
   switch (Inst->getOpcode()) {
   case Instruction::AddrSpaceCast:
-  case Instruction::BitCast:
-  case Instruction::Freeze: {
-    // Just forward the operand for size-preserving type conversions and freeze
+  case Instruction::BitCast: {
+    // Just forward the operand for size-preserving type conversions
     auto *Op = Inst->getOperand(0);
     auto It = ValueInfos.find(Op);
     assert(It != ValueInfos.end());
     return It->second;
   }
+  case Instruction::Freeze: {
+    auto *Op = Inst->getOperand(0);
+    auto It = ValueInfos.find(Op);
+    assert(It != ValueInfos.end());
+    return VIB.createFreeze(It->second, Opts.FreezeMode);
+  }
   case Instruction::ExtractElement: {
     auto *EE = cast<ExtractElementInst>(Inst);
     auto *Vec = EE->getVectorOperand();
diff --git a/compilerutils/lib/ValueOriginTrackingTestPass.cpp b/compilerutils/lib/ValueOriginTrackingTestPass.cpp
index ff682f5d2c..0b957f9b96 100644
--- a/compilerutils/lib/ValueOriginTrackingTestPass.cpp
+++ b/compilerutils/lib/ValueOriginTrackingTestPass.cpp
@@ -39,11 +39,16 @@ namespace {
 
 cl::opt<unsigned> BytesPerSliceOption("value-origin-tracking-test-bytes-per-slice", cl::init(4));
 cl::opt<unsigned> MaxBytesPerValueOption("value-origin-tracking-test-max-bytes-per-value", cl::init(512));
+cl::opt<unsigned>
+    FreezeModeOption("value-origin-tracking-test-freeze-mode",
+                     cl::init(static_cast<unsigned>(ValueOriginTracker::Options::FreezeHandlingMode::Dynamic)));
 
 // Parse assumptions made via calls to the assume function.
 ValueOriginTracker::ValueOriginAssumptions parseAssumptions(Module &Module, Function &AssumeFunc) {
   ValueOriginTracker::ValueOriginAssumptions Result;
+  SmallVector<Instruction *> ToBeRemoved;
   forEachCall(AssumeFunc, [&](CallInst &AssumptionCall) {
+    ToBeRemoved.push_back(&AssumptionCall);
     unsigned NumArgs = AssumptionCall.arg_size();
     // We expect one arg for the value, and two per slice.
     if (NumArgs % 2 != 1)
@@ -82,6 +87,9 @@ ValueOriginTracker::ValueOriginAssumptions parseAssumptions(Module &Module, Func
     if (!Inserted)
       report_fatal_error("value with duplicate assumption");
   });
+  // Remove the assume calls before starting the analysis so that they cannot influence it.
+  for (auto *Inst : ToBeRemoved)
+    Inst->eraseFromParent();
   return Result;
 }
 
@@ -101,32 +109,64 @@ llvm::PreservedAnalyses ValueOriginTrackingTestPass::run(llvm::Module &Module,
     Assumptions = parseAssumptions(Module, *AssumeFunc);
   }
 
-  ValueOriginTracker VOT{Module.getDataLayout(), BytesPerSliceOption.getValue(), MaxBytesPerValueOption.getValue(),
-                         Assumptions};
-
-  auto Prefix = "[VOT]: ";
+  ValueOriginTracker::Options Opts{};
+  Opts.FreezeMode = static_cast<ValueOriginTracker::Options::FreezeHandlingMode>(FreezeModeOption.getValue());
+  Opts.BytesPerSlice = BytesPerSliceOption.getValue();
+  Opts.MaxBytesPerValue = MaxBytesPerValueOption.getValue();
+  ValueOriginTracker VOT{Module.getDataLayout(), Opts, Assumptions};
 
   // Traverse all functions instead of the users of AnalyzeFunc to group output by function
+  // First collect values to be analyzed, then remove analyze calls, and then do the actual analysis.
+  // This ensures analysis calls don't interfere with the analysis, e.g. when freeze handling depends on the number of
+  // users.
+  SmallVector<Instruction *> ToBeRemoved;
+  struct AnalyzeCallsInfo {
+    SmallVector<Value *> Operands;
+  };
+  struct FunctionInfo {
+    Function *F;
+    SmallVector<AnalyzeCallsInfo> AnalyzeCalls;
+  };
+  SmallVector<FunctionInfo> ToBeAnalyzed;
+
   for (auto &F : Module) {
     if (F.isDeclaration())
       continue;
+    ToBeAnalyzed.push_back({});
+    auto &FuncInfo = ToBeAnalyzed.back();
+    FuncInfo.F = &F;
 
-    outs() << Prefix << F.getName() << "\n";
     for (auto &BB : F) {
       for (auto &I : BB) {
         auto *CI = dyn_cast<CallInst>(&I);
         if (!CI || CI->getCalledOperand() != AnalyzeFunc) {
           continue;
         }
+        ToBeRemoved.push_back(CI);
+        FuncInfo.AnalyzeCalls.push_back({});
+        auto &AnalyzeInfo = FuncInfo.AnalyzeCalls.back();
 
         for (Value *Op : CI->data_ops()) {
-          auto VI = VOT.getValueInfo(Op);
-          outs() << Prefix << "(" << *Op << "): " << VI << "\n";
+          AnalyzeInfo.Operands.push_back(Op);
         }
-        outs() << "\n";
       }
     }
   }
+
+  for (auto *Inst : ToBeRemoved)
+    Inst->eraseFromParent();
+
+  auto Prefix = "[VOT]: ";
+  for (const auto &FuncInfo : ToBeAnalyzed) {
+    outs() << Prefix << FuncInfo.F->getName() << "\n";
+    for (const auto &AnalyzeInfo : FuncInfo.AnalyzeCalls) {
+      for (Value *Op : AnalyzeInfo.Operands) {
+        auto VI = VOT.getValueInfo(Op);
+        outs() << Prefix << "(" << *Op << "): " << VI << "\n";
+      }
+      outs() << "\n";
+    }
+  }
-  return PreservedAnalyses::all();
+  return PreservedAnalyses::none(); // analyze calls may have been erased, so the IR is not preserved
 }
 
diff --git a/compilerutils/lib/ValueSpecializationTestPass.cpp b/compilerutils/lib/ValueSpecializationTestPass.cpp
index 884ce1733a..ea4e4e4589 100644
--- a/compilerutils/lib/ValueSpecializationTestPass.cpp
+++ b/compilerutils/lib/ValueSpecializationTestPass.cpp
@@ -96,64 +96,84 @@ llvm::PreservedAnalyses ValueSpecializationTestPass::run(llvm::Module &Module,
   if (!SpecializeFunc)
     return PreservedAnalyses::all();
 
+  // First collect all specialization requests grouped by BB (because we re-use the specializer per BB),
+  // then remove all specialization intrinsic calls, then do the actual specialization.
+  struct BBInfo {
+    SmallVector<ValueSpecializationInfo> SpecializationRequests;
+  };
+  SmallVector<BBInfo> SpecializationRequestsByBB;
+
   SmallVector<CallInst *> ToBeDeleted;
+
   for (auto &F : Module) {
     for (auto &BB : F) {
-      // Use one specialize per BB, and re-use insertion points.
-      ValueSpecializer VS(Module);
-
+      SpecializationRequestsByBB.push_back({});
       for (auto &Inst : BB) {
         auto *CI = dyn_cast<CallInst>(&Inst);
         if (!CI || CI->getCalledOperand() != SpecializeFunc) {
           continue;
         }
         ToBeDeleted.push_back(CI);
+        SpecializationRequestsByBB.back().SpecializationRequests.push_back(parseSpecializeCall(*CI));
+      }
+    }
+  }
 
-        ValueSpecializationInfo VSI = parseSpecializeCall(*CI);
-        bool ReplaceUses = true;
-        bool PreserveInsertionPoint = true;
-        const auto [Replacement, NumReplacedDwords] =
-            VS.replaceDwords(VSI.Val, VSI.DwordInfos, ReplaceUses, PreserveInsertionPoint);
-
-        if (!(VSI.Flags & TestFlags::AllowFailure) && NumReplacedDwords != VSI.NumToBeReplacedDwords)
-          report_fatal_error("Less than expected replacements");
-        if (NumReplacedDwords != 0 && Replacement == nullptr)
-          report_fatal_error("Missing replacement result");
-
-        if (Replacement && !(VSI.Flags & TestFlags::SkipValueTrackingCheck)) {
-          // Run value tracking analysis on the replacement result, and check that it matches the requested replacements
-          ValueOriginTracker VOT{Module.getDataLayout(), 4, 256};
-          const ValueTracking::ValueInfo VI = VOT.getValueInfo(Replacement);
-          if (VI.Slices.size() != VSI.DwordInfos.size())
-            report_fatal_error("Size mismatch");
-          for (unsigned DwordIdx = 0; DwordIdx < VI.Slices.size(); ++DwordIdx) {
-            const ValueTracking::SliceInfo &SI = VI.Slices[DwordIdx];
-            const ValueSpecializer::DwordSpecializationInfo &DSI = VSI.DwordInfos[DwordIdx];
-            if (DSI.Kind == ValueSpecializer::SpecializationKind::Constant) {
-              if (SI.Status != ValueTracking::SliceStatus::Constant || SI.ConstantValue != DSI.ConstantValue)
-                report_fatal_error("Failed constant specialization");
-            }
-            if (DSI.Kind == ValueSpecializer::SpecializationKind::FrozenPoison) {
-              if (SI.Status != ValueTracking::SliceStatus::UndefOrPoison)
-                report_fatal_error("Failed frozen poison specialization");
+  for (auto *CI : ToBeDeleted)
+    CI->eraseFromParent();
+
+  for (const auto &BBInfo : SpecializationRequestsByBB) {
+    // Use one specializer per BB, and re-use insertion points.
+    ValueSpecializer VS(Module);
+
+    for (const auto &VSI : BBInfo.SpecializationRequests) {
+      bool ReplaceUses = true;
+      bool PreserveInsertionPoint = true;
+      const auto [Replacement, NumReplacedDwords] =
+          VS.replaceDwords(VSI.Val, VSI.DwordInfos, ReplaceUses, PreserveInsertionPoint);
+
+      if (!(VSI.Flags & TestFlags::AllowFailure) && NumReplacedDwords != VSI.NumToBeReplacedDwords)
+        report_fatal_error("Less than expected replacements");
+      if (NumReplacedDwords != 0 && Replacement == nullptr)
+        report_fatal_error("Missing replacement result");
+
+      if (Replacement && !(VSI.Flags & TestFlags::SkipValueTrackingCheck)) {
+        // Run value tracking analysis on the replacement result, and check that it matches the requested replacements
+        // Use Forward freeze handling mode. This is the most relaxed one and helps avoid mismatches
+        // caused by conservative analysis of freeze.
+        ValueOriginTracker::Options Opts{};
+        Opts.FreezeMode = ValueOriginTracker::Options::FreezeHandlingMode::Forward;
+        Opts.MaxBytesPerValue = 256;
+        Opts.BytesPerSlice = 4;
+        ValueOriginTracker VOT{Module.getDataLayout(), Opts};
+        const ValueTracking::ValueInfo VI = VOT.getValueInfo(Replacement);
+        if (VI.Slices.size() != VSI.DwordInfos.size())
+          report_fatal_error("Size mismatch");
+        for (unsigned DwordIdx = 0; DwordIdx < VI.Slices.size(); ++DwordIdx) {
+          const ValueTracking::SliceInfo &SI = VI.Slices[DwordIdx];
+          const ValueSpecializer::DwordSpecializationInfo &DSI = VSI.DwordInfos[DwordIdx];
+          if (DSI.Kind == ValueSpecializer::SpecializationKind::Constant) {
+            if (SI.Status != ValueTracking::SliceStatus::Constant || SI.ConstantValue != DSI.ConstantValue)
+              report_fatal_error("Failed constant specialization");
+          }
+          if (DSI.Kind == ValueSpecializer::SpecializationKind::FrozenPoison) {
+            if (SI.Status != ValueTracking::SliceStatus::UndefOrPoison) {
+              report_fatal_error("Failed frozen poison specialization");
             }
           }
         }
+      }
 
-        dbgs() << "[VS]: Replaced " << NumReplacedDwords << " dwords in ";
-        VSI.Val->printAsOperand(dbgs());
-        if (Replacement) {
-          dbgs() << ", replaced by ";
-          Replacement->printAsOperand(dbgs());
-        }
-        dbgs() << "\n";
+      dbgs() << "[VS]: Replaced " << NumReplacedDwords << " dwords in ";
+      VSI.Val->printAsOperand(dbgs());
+      if (Replacement) {
+        dbgs() << ", replaced by ";
+        Replacement->printAsOperand(dbgs());
       }
+      dbgs() << "\n";
     }
   }
 
-  for (auto *CI : ToBeDeleted)
-    CI->eraseFromParent();
-
   return PreservedAnalyses::none();
 }
 
diff --git a/compilerutils/test/cross-module-inliner/inc/link-struct-ptr-argument.ll b/compilerutils/test/cross-module-inliner/inc/link-struct-ptr-argument.ll
new file mode 100644
index 0000000000..6969eaf9e5
--- /dev/null
+++ b/compilerutils/test/cross-module-inliner/inc/link-struct-ptr-argument.ll
@@ -0,0 +1,12 @@
+%struct.MyClass.1 = type { i32 }
+
+define i32 @inline_fun(ptr nocapture readonly %c) {
+  %1 = getelementptr inbounds %struct.MyClass.1, ptr %c, i32 0, i32 0
+  %result = load i32, ptr %1, align 4
+  ret i32 %result
+}
+
+define i32 @inline_fun_struct(%struct.MyClass.1 %c) {
+  %result = extractvalue %struct.MyClass.1 %c, 0
+  ret i32 %result
+}
diff --git a/compilerutils/test/cross-module-inliner/link-struct-ptr-argument.ll b/compilerutils/test/cross-module-inliner/link-struct-ptr-argument.ll
new file mode 100644
index 0000000000..55c1c3f60c
--- /dev/null
+++ b/compilerutils/test/cross-module-inliner/link-struct-ptr-argument.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool cross-module-inline --version 3
+; RUN: cross-module-inline %s %S/inc/link-struct-ptr-argument.ll --link inline_fun --link inline_fun_struct | FileCheck %s
+;
+; Inline a function with a struct argument passed as a pointer. Check that the return value is loaded from the pointer.
+
+%struct.MyClass = type { i32 }
+
+declare !pointeetys !0 i32 @inline_fun(ptr)
+declare i32 @inline_fun_struct(%struct.MyClass)
+
+define i32 @main() {
+; CHECK-LABEL: define i32 @main() {
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [[STRUCT_MYCLASS:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_MYCLASS]], ptr [[TMP1]], i32 0, i32 0
+; CHECK-NEXT:    store i32 5, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[RESULT_I:%.*]] = load i32, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[S:%.*]] = load %struct.MyClass, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[RESULT_I1:%.*]] = extractvalue %struct.MyClass [[S]], 0
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[RESULT_I]], [[RESULT_I1]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %1 = alloca %struct.MyClass, align 4
+  %2 = getelementptr inbounds %struct.MyClass, ptr %1, i32 0, i32 0
+  store i32 5, ptr %2, align 4
+  %result = call i32 @inline_fun(ptr nonnull %1)
+  %s = load %struct.MyClass, ptr %1
+  %result2 = call i32 @inline_fun_struct(%struct.MyClass %s)
+  %add = add i32 %result, %result2
+  ret i32 %add
+}
+
+!0 = !{%struct.MyClass poison}
diff --git a/compilerutils/test/value-origin-tracking/basic-tests.ll b/compilerutils/test/value-origin-tracking/basic-tests.ll
index d722c02f77..c77587df09 100644
--- a/compilerutils/test/value-origin-tracking/basic-tests.ll
+++ b/compilerutils/test/value-origin-tracking/basic-tests.ll
@@ -131,8 +131,9 @@ define void @testPoison() {
 ; CHECK: (double poison): UndefOrPoison; UndefOrPoison
   call void @analyze(double poison)
 
+; See freeze-mode.ll for detailed freeze tests.
   %freezePoison = freeze i32 poison
-; CHECK: (  %freezePoison = {{.*}}): UndefOrPoison
+; CHECK: (  %freezePoison = {{.*}}): Dynamic
   call void @analyze(i32 %freezePoison)
 
   %freezeNonPoison = freeze i32 5
diff --git a/compilerutils/test/value-origin-tracking/freeze-mode.ll b/compilerutils/test/value-origin-tracking/freeze-mode.ll
new file mode 100644
index 0000000000..7b50418c28
--- /dev/null
+++ b/compilerutils/test/value-origin-tracking/freeze-mode.ll
@@ -0,0 +1,70 @@
+; RUN: opt -passes="value-origin-tracking-test" -S %s -value-origin-tracking-test-freeze-mode=0 | FileCheck %s --check-prefix=DYNAMIC
+; RUN: opt -passes="value-origin-tracking-test" -S %s -value-origin-tracking-test-freeze-mode=1 | FileCheck %s --check-prefix=FORWARD
+
+declare void @analyze(...)
+
+define void @testSimpleFreeze() {
+; CHECK-LABEL: testSimpleFreeze
+  %freeze = freeze i32 poison
+; DYNAMIC: %freeze = {{.*}}: Dynamic
+; FORWARD: %freeze = {{.*}}: UndefOrPoison
+  call void @analyze(i32 %freeze)
+  ret void
+}
+
+define void @testSelectMultipleFreezes(i1 %cond) {
+; CHECK-LABEL: testSelectMultipleFreezes
+  %freeze.0 = freeze i32 poison
+  %freeze.1 = freeze i32 poison
+  %merged = select i1 %cond, i32 %freeze.0, i32 %freeze.1
+; DYNAMIC: %merged = {{.*}}: Dynamic
+; FORWARD: %merged = {{.*}}: UndefOrPoison
+  call void @analyze(i32 %merged)
+  ret void
+}
+
+define void @testSelectFreezeWithConstant(i1 %cond) {
+; CHECK-LABEL: testSelectFreezeWithConstant
+  %freeze.0 = freeze i32 poison
+  %freeze.1 = freeze i32 poison
+  %merged.with.0 = select i1 %cond, i32 %freeze.0, i32 0
+  %merged.with.1 = select i1 %cond, i32 %freeze.1, i32 1
+; DYNAMIC: %merged.with.0 = {{.*}}: (Constant: 0x0 | Dynamic: {{.*}})
+; FORWARD: %merged.with.0 = {{.*}}: (UndefOrPoison | Constant: 0x0)
+  call void @analyze(i32 %merged.with.0)
+; DYNAMIC: %merged.with.1 = {{.*}}: (Constant: 0x1 | Dynamic: {{.*}})
+; FORWARD: %merged.with.1 = {{.*}}: (UndefOrPoison | Constant: 0x1)
+  call void @analyze(i32 %merged.with.1)
+  ret void
+}
+
+define void @testFreezeNonPoison(i1 %cond, i32 %arg) {
+; CHECK-LABEL: testFreezeNonPoison
+  %add = add i32 1, 1
+; DYNAMIC: %add = {{.*}}: Constant: 0x2
+; FORWARD: %add = {{.*}}: Constant: 0x2
+  call void @analyze(i32 %add)
+  %frozen.add = freeze i32 %add
+; DYNAMIC: %frozen.add = {{.*}}: Constant: 0x2
+; FORWARD: %frozen.add = {{.*}}: Constant: 0x2
+  call void @analyze(i32 %frozen.add)
+
+  %arg.or.constant = select i1 %cond, i32 15, i32 %arg
+; DYNAMIC: %arg.or.constant = {{.*}}: (Constant: 0xf | Dynamic (argument): {{.*}})
+; FORWARD: %arg.or.constant = {{.*}}: (Constant: 0xf | Dynamic (argument): {{.*}})
+  call void @analyze(i32 %arg.or.constant)
+  %arg.or.constant.frozen = freeze i32 %arg.or.constant
+; DYNAMIC: %arg.or.constant.frozen = {{.*}}: (Constant: 0xf | Dynamic (argument): {{.*}})
+; FORWARD: %arg.or.constant.frozen = {{.*}}: (Constant: 0xf | Dynamic (argument): {{.*}})
+  call void @analyze(i32 %arg.or.constant.frozen)
+
+  %arg.or.poison = select i1 %cond, i32 poison, i32 %arg
+; DYNAMIC: %arg.or.poison = {{.*}}: (UndefOrPoison | Dynamic (argument): {{.*}})
+; FORWARD: %arg.or.poison = {{.*}}: (UndefOrPoison | Dynamic (argument): {{.*}})
+  call void @analyze(i32 %arg.or.poison)
+  %arg.or.poison.frozen = freeze i32 %arg.or.poison
+; DYNAMIC: %arg.or.poison.frozen = {{.*}}: Dynamic
+; FORWARD: %arg.or.poison.frozen = {{.*}}: (UndefOrPoison | Dynamic (argument): {{.*}})
+  call void @analyze(i32 %arg.or.poison.frozen)
+  ret void
+}
diff --git a/compilerutils/tool/cross-module-inline/cross-module-inline.cpp b/compilerutils/tool/cross-module-inline/cross-module-inline.cpp
index affe2e470d..18e59b5315 100644
--- a/compilerutils/tool/cross-module-inline/cross-module-inline.cpp
+++ b/compilerutils/tool/cross-module-inline/cross-module-inline.cpp
@@ -111,7 +111,7 @@ int main(int argc, char **argv) {
       if (auto *CInst = dyn_cast<CallInst>(Use.getUser())) {
         if (CInst->isCallee(&Use)) {
           // Change call target to other module
-          Use = targetF;
+          CInst->setCalledFunction(targetF);
 
           inliner.inlineCall(*CInst);
         }
diff --git a/docs/ComputeShaderDerivateGroups.md b/docs/ComputeShaderDerivateGroups.md
index 782b83be5f..7e52c7cd92 100644
--- a/docs/ComputeShaderDerivateGroups.md
+++ b/docs/ComputeShaderDerivateGroups.md
@@ -116,12 +116,12 @@ During implementation, some interfaces and structures should be changed as follo
 1. Replace `ResourceUsage::builtInUsage.cs.workgroupLayout` with a `ResourceUsage::builtInUsage.cs.foldWorkgroupXY` boolean. The `Gfx[69]ConfigBuilder` will use this boolean instead of `workgroupLayout`.
 2. Add `ComputeShaderMode::derivatives` field with three enum values: `None`, `Linear`, `Quads`. This field is populated by the SPIRV reader.
 3. Change the `InOutBuilder` to always insert a call to `lgc.reconfigure.local.invocation.id` for `BuiltInLocalInvocationId`. Remove `lgc.swizzle.local.invocation.id`.
-4. `PatchInOutImportExport::processShader` will handle `lgc.reconfigure.local.invocation.id` similar to today. However:
+4. `LowerInOut::processShader` will handle `lgc.reconfigure.local.invocation.id` similar to today. However:
    * Take the `ComputeShaderMode::derivatives` field into account when determining the workgroup layout to use
    * Integrate `swizzleLocalInvocationIdIn8x4`; basically, the decision of micro- vs. macro-tiling becomes explicitly orthogonal. Instead of `WorkgroupLayout::{Linear, Quads, SexagintiQuads}` and then 8x4 swizzling, there is only `Linear` vs. `Quads` for the micro-tiling and then `Linear` vs. `Block8` for the macro-tiling.
    * The `forceCsThreadIdSwizzling` pipeline option can stay the same for compatibility, it is simply used as an input to the determination of which layout to use.
 
-Note that all decision-making is centralized in one place, which is `PatchInOutImportExport`: the inputs of pipeline options as well as derivatives mode are used once to determine the workgroup layout.
+Note that all decision-making is centralized in one place, which is `LowerInOut`: the inputs of pipeline options as well as derivatives mode are used once to determine the workgroup layout.
 
 # Issues
 ## Helper invocations support
diff --git a/gfxruntime/CMakeLists.txt b/gfxruntime/CMakeLists.txt
index 382ff8478c..2d89d7c6e3 100644
--- a/gfxruntime/CMakeLists.txt
+++ b/gfxruntime/CMakeLists.txt
@@ -39,6 +39,7 @@ find_package(Python3
 # Locate dxc binary.
 #if _WIN32
 if(WIN32)
+    set(DXC_PATH "$ENV{DK_ROOT}/DirectXShaderCompiler/8c9d92b/bin")
     if (NOT EXISTS "${DXC_PATH}")
         message(FATAL_ERROR "Unable to find DirectXShaderCompiler directory: ${DXC_PATH}")
     endif()
diff --git a/imported/llvm-dialects b/imported/llvm-dialects
index bdfb113d8d..c436594690 160000
--- a/imported/llvm-dialects
+++ b/imported/llvm-dialects
@@ -1 +1 @@
-Subproject commit bdfb113d8d765bdf4554a2b30ae909b93f26aeea
+Subproject commit c4365946902436063f872dbcf1a370fe73982a54
diff --git a/include/khronos/spirv/GLSL.std.450.h b/include/khronos/spirv/GLSL.std.450.h
index 54cc00e9a8..0594f907a1 100644
--- a/include/khronos/spirv/GLSL.std.450.h
+++ b/include/khronos/spirv/GLSL.std.450.h
@@ -1,5 +1,5 @@
 /*
-** Copyright (c) 2014-2016 The Khronos Group Inc.
+** Copyright (c) 2014-2024 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a copy
 ** of this software and/or associated documentation files (the "Materials"),
diff --git a/include/khronos/spirv/NonSemanticDebugBreak.h b/include/khronos/spirv/NonSemanticDebugBreak.h
index 6ec2b5bb39..8604fe7842 100644
--- a/include/khronos/spirv/NonSemanticDebugBreak.h
+++ b/include/khronos/spirv/NonSemanticDebugBreak.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2020 The Khronos Group Inc.
+// Copyright (c) 2020-2024 The Khronos Group Inc.
 // 
 // Permission is hereby granted, free of charge, to any person obtaining a
 // copy of this software and/or associated documentation files (the
diff --git a/include/khronos/spirv/NonSemanticDebugPrintf.h b/include/khronos/spirv/NonSemanticDebugPrintf.h
index 3ca7247f2b..bc24683ec0 100644
--- a/include/khronos/spirv/NonSemanticDebugPrintf.h
+++ b/include/khronos/spirv/NonSemanticDebugPrintf.h
@@ -1,5 +1,5 @@
-// Copyright (c) 2020 The Khronos Group Inc.
-//
+// Copyright (c) 2020-2024 The Khronos Group Inc.
+// 
 // Permission is hereby granted, free of charge, to any person obtaining a
 // copy of this software and/or associated documentation files (the
 // "Materials"), to deal in the Materials without restriction, including
@@ -7,15 +7,15 @@
 // distribute, sublicense, and/or sell copies of the Materials, and to
 // permit persons to whom the Materials are furnished to do so, subject to
 // the following conditions:
-//
+// 
 // The above copyright notice and this permission notice shall be included
 // in all copies or substantial portions of the Materials.
-//
+// 
 // MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
 // KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
 // SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
 //    https://www.khronos.org/registry/
-//
+// 
 // THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -23,7 +23,7 @@
 // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 // MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
-//
+// 
 
 #ifndef SPIRV_UNIFIED1_NonSemanticDebugPrintf_H_
 #define SPIRV_UNIFIED1_NonSemanticDebugPrintf_H_
diff --git a/include/khronos/spirv/NonSemanticShaderDebugInfo100.h b/include/khronos/spirv/NonSemanticShaderDebugInfo100.h
index c52f32f809..b276b560cb 100644
--- a/include/khronos/spirv/NonSemanticShaderDebugInfo100.h
+++ b/include/khronos/spirv/NonSemanticShaderDebugInfo100.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 The Khronos Group Inc.
+// Copyright (c) 2018-2024 The Khronos Group Inc.
 // 
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and/or associated documentation files (the "Materials"),
diff --git a/include/khronos/spirv/spirv.hpp b/include/khronos/spirv/spirv.hpp
index b9c8743371..1065013035 100644
--- a/include/khronos/spirv/spirv.hpp
+++ b/include/khronos/spirv/spirv.hpp
@@ -190,7 +190,9 @@ enum ExecutionMode {
     ExecutionModeOutputLinesNV = 5269,
     ExecutionModeOutputPrimitivesEXT = 5270,
     ExecutionModeOutputPrimitivesNV = 5270,
+    ExecutionModeDerivativeGroupQuadsKHR = 5289,
     ExecutionModeDerivativeGroupQuadsNV = 5289,
+    ExecutionModeDerivativeGroupLinearKHR = 5290,
     ExecutionModeDerivativeGroupLinearNV = 5290,
     ExecutionModeOutputTrianglesEXT = 5298,
     ExecutionModeOutputTrianglesNV = 5298,
@@ -215,6 +217,9 @@ enum ExecutionMode {
     ExecutionModeStreamingInterfaceINTEL = 6154,
     ExecutionModeRegisterMapInterfaceINTEL = 6160,
     ExecutionModeNamedBarrierCountINTEL = 6417,
+    ExecutionModeMaximumRegistersINTEL = 6461,
+    ExecutionModeMaximumRegistersIdINTEL = 6462,
+    ExecutionModeNamedMaximumRegistersINTEL = 6463,
     ExecutionModeMax = 0x7fffffff,
 };
 
@@ -374,6 +379,7 @@ enum ImageChannelDataType {
     ImageChannelDataTypeUnormInt101010_2 = 16,
     ImageChannelDataTypeUnsignedIntRaw10EXT = 19,
     ImageChannelDataTypeUnsignedIntRaw12EXT = 20,
+    ImageChannelDataTypeUnormInt2_101010EXT = 21,
     ImageChannelDataTypeMax = 0x7fffffff,
 };
 
@@ -540,6 +546,7 @@ enum Decoration {
     DecorationNoUnsignedWrap = 4470,
     DecorationWeightTextureQCOM = 4487,
     DecorationBlockMatchTextureQCOM = 4488,
+    DecorationBlockMatchSamplerQCOM = 4499,
     DecorationExplicitInterpAMD = 4999,
     DecorationNodeSharesPayloadLimitsWithAMDX = 5019,
     DecorationNodeMaxPayloadsAMDX = 5020,
@@ -1041,6 +1048,7 @@ enum Capability {
     CapabilityTileImageColorReadAccessEXT = 4166,
     CapabilityTileImageDepthReadAccessEXT = 4167,
     CapabilityTileImageStencilReadAccessEXT = 4168,
+    CapabilityCooperativeMatrixLayoutsARM = 4201,
     CapabilityFragmentShadingRateKHR = 4422,
     CapabilitySubgroupBallotKHR = 4423,
     CapabilityDrawParameters = 4427,
@@ -1070,11 +1078,13 @@ enum Capability {
     CapabilityRoundingModeRTZ = 4468,
     CapabilityRayQueryProvisionalKHR = 4471,
     CapabilityRayQueryKHR = 4472,
+    CapabilityUntypedPointersKHR = 4473,
     CapabilityRayTraversalPrimitiveCullingKHR = 4478,
     CapabilityRayTracingKHR = 4479,
     CapabilityTextureSampleWeightedQCOM = 4484,
     CapabilityTextureBoxFilterQCOM = 4485,
     CapabilityTextureBlockMatchQCOM = 4486,
+    CapabilityTextureBlockMatch2QCOM = 4498,
     CapabilityFloat16ImageAMD = 5008,
     CapabilityImageGatherBiasLodAMD = 5009,
     CapabilityFragmentMaskAMD = 5010,
@@ -1097,6 +1107,7 @@ enum Capability {
     CapabilityMeshShadingEXT = 5283,
     CapabilityFragmentBarycentricKHR = 5284,
     CapabilityFragmentBarycentricNV = 5284,
+    CapabilityComputeDerivativeGroupQuadsKHR = 5288,
     CapabilityComputeDerivativeGroupQuadsNV = 5288,
     CapabilityFragmentDensityEXT = 5291,
     CapabilityShadingRateNV = 5291,
@@ -1134,6 +1145,7 @@ enum Capability {
     CapabilityVulkanMemoryModelDeviceScopeKHR = 5346,
     CapabilityPhysicalStorageBufferAddresses = 5347,
     CapabilityPhysicalStorageBufferAddressesEXT = 5347,
+    CapabilityComputeDerivativeGroupLinearKHR = 5350,
     CapabilityComputeDerivativeGroupLinearNV = 5350,
     CapabilityRayTracingProvisionalKHR = 5353,
     CapabilityCooperativeMatrixNV = 5357,
@@ -1148,7 +1160,9 @@ enum Capability {
     CapabilityShaderInvocationReorderNV = 5383,
     CapabilityBindlessTextureNV = 5390,
     CapabilityRayQueryPositionFetchKHR = 5391,
+    CapabilityAtomicFloat16VectorNV = 5404,
     CapabilityRayTracingDisplacementMicromapNV = 5409,
+    CapabilityRawAccessChainsNV = 5414,
     CapabilitySubgroupShuffleINTEL = 5568,
     CapabilitySubgroupBufferBlockIOINTEL = 5569,
     CapabilitySubgroupImageBlockIOINTEL = 5570,
@@ -1201,6 +1215,7 @@ enum Capability {
     CapabilityDotProductKHR = 6019,
     CapabilityRayCullMaskKHR = 6020,
     CapabilityCooperativeMatrixKHR = 6022,
+    CapabilityReplicatedCompositesEXT = 6024,
     CapabilityBitInstructions = 6025,
     CapabilityGroupNonUniformRotateKHR = 6026,
     CapabilityFloatControls2 = 6029,
@@ -1219,9 +1234,11 @@ enum Capability {
     CapabilityFPGAArgumentInterfacesINTEL = 6174,
     CapabilityGlobalVariableHostAccessINTEL = 6187,
     CapabilityGlobalVariableFPGADecorationsINTEL = 6189,
+    CapabilitySubgroupBufferPrefetchINTEL = 6220,
     CapabilityGroupUniformArithmeticKHR = 6400,
     CapabilityMaskedGatherScatterINTEL = 6427,
     CapabilityCacheControlsINTEL = 6441,
+    CapabilityRegisterLimitsINTEL = 6460,
     CapabilityMax = 0x7fffffff,
 };
 
@@ -1349,6 +1366,8 @@ enum CooperativeMatrixOperandsMask {
 enum CooperativeMatrixLayout {
     CooperativeMatrixLayoutRowMajorKHR = 0,
     CooperativeMatrixLayoutColumnMajorKHR = 1,
+    CooperativeMatrixLayoutRowBlockedInterleavedARM = 4202,
+    CooperativeMatrixLayoutColumnBlockedInterleavedARM = 4203,
     CooperativeMatrixLayoutMax = 0x7fffffff,
 };
 
@@ -1390,6 +1409,27 @@ enum StoreCacheControl {
     StoreCacheControlMax = 0x7fffffff,
 };
 
+enum NamedMaximumNumberOfRegisters {
+    NamedMaximumNumberOfRegistersAutoINTEL = 0,
+    NamedMaximumNumberOfRegistersMax = 0x7fffffff,
+};
+
+enum RawAccessChainOperandsShift {
+    RawAccessChainOperandsRobustnessPerComponentNVShift = 0,
+    RawAccessChainOperandsRobustnessPerElementNVShift = 1,
+    RawAccessChainOperandsMax = 0x7fffffff,
+};
+
+enum RawAccessChainOperandsMask {
+    RawAccessChainOperandsMaskNone = 0,
+    RawAccessChainOperandsRobustnessPerComponentNVMask = 0x00000001,
+    RawAccessChainOperandsRobustnessPerElementNVMask = 0x00000002,
+};
+
+enum FPEncoding {
+    FPEncodingMax = 0x7fffffff,
+};
+
 enum Op {
     OpNop = 0,
     OpUndef = 1,
@@ -1739,13 +1779,22 @@ enum Op {
     OpDepthAttachmentReadEXT = 4161,
     OpStencilAttachmentReadEXT = 4162,
     OpTerminateInvocation = 4416,
+    OpTypeUntypedPointerKHR = 4417,
+    OpUntypedVariableKHR = 4418,
+    OpUntypedAccessChainKHR = 4419,
+    OpUntypedInBoundsAccessChainKHR = 4420,
     OpSubgroupBallotKHR = 4421,
     OpSubgroupFirstInvocationKHR = 4422,
+    OpUntypedPtrAccessChainKHR = 4423,
+    OpUntypedInBoundsPtrAccessChainKHR = 4424,
+    OpUntypedArrayLengthKHR = 4425,
+    OpUntypedPrefetchKHR = 4426,
     OpSubgroupAllKHR = 4428,
     OpSubgroupAnyKHR = 4429,
     OpSubgroupAllEqualKHR = 4430,
     OpGroupNonUniformRotateKHR = 4431,
     OpSubgroupReadInvocationKHR = 4432,
+    OpExtInstWithForwardRefsKHR = 4433,
     OpTraceRayKHR = 4445,
     OpExecuteCallableKHR = 4446,
     OpConvertUToAccelerationStructureKHR = 4447,
@@ -1768,6 +1817,9 @@ enum Op {
     OpCooperativeMatrixStoreKHR = 4458,
     OpCooperativeMatrixMulAddKHR = 4459,
     OpCooperativeMatrixLengthKHR = 4460,
+    OpConstantCompositeReplicateEXT = 4461,
+    OpSpecConstantCompositeReplicateEXT = 4462,
+    OpCompositeConstructReplicateEXT = 4463,
     OpTypeRayQueryKHR = 4472,
     OpRayQueryInitializeKHR = 4473,
     OpRayQueryTerminateKHR = 4474,
@@ -1779,6 +1831,10 @@ enum Op {
     OpImageBoxFilterQCOM = 4481,
     OpImageBlockMatchSSDQCOM = 4482,
     OpImageBlockMatchSADQCOM = 4483,
+    OpImageBlockMatchWindowSSDQCOM = 4500,
+    OpImageBlockMatchWindowSADQCOM = 4501,
+    OpImageBlockMatchGatherSSDQCOM = 4502,
+    OpImageBlockMatchGatherSADQCOM = 4503,
     OpGroupIAddNonUniformAMD = 5000,
     OpGroupFAddNonUniformAMD = 5001,
     OpGroupFMinNonUniformAMD = 5002,
@@ -1863,6 +1919,7 @@ enum Op {
     OpConvertUToSampledImageNV = 5395,
     OpConvertSampledImageToUNV = 5396,
     OpSamplerImageAddressingModeNV = 5397,
+    OpRawAccessChainNV = 5398,
     OpSubgroupShuffleINTEL = 5571,
     OpSubgroupShuffleDownINTEL = 5572,
     OpSubgroupShuffleUpINTEL = 5573,
@@ -2109,6 +2166,7 @@ enum Op {
     OpConvertBF16ToFINTEL = 6117,
     OpControlBarrierArriveINTEL = 6142,
     OpControlBarrierWaitINTEL = 6143,
+    OpSubgroupBlockPrefetchINTEL = 6221,
     OpGroupIMulKHR = 6401,
     OpGroupFMulKHR = 6402,
     OpGroupBitwiseAndKHR = 6403,
@@ -2478,13 +2536,22 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) {
     case OpDepthAttachmentReadEXT: *hasResult = true; *hasResultType = true; break;
     case OpStencilAttachmentReadEXT: *hasResult = true; *hasResultType = true; break;
     case OpTerminateInvocation: *hasResult = false; *hasResultType = false; break;
+    case OpTypeUntypedPointerKHR: *hasResult = true; *hasResultType = false; break;
+    case OpUntypedVariableKHR: *hasResult = true; *hasResultType = true; break;
+    case OpUntypedAccessChainKHR: *hasResult = true; *hasResultType = true; break;
+    case OpUntypedInBoundsAccessChainKHR: *hasResult = true; *hasResultType = true; break;
     case OpSubgroupBallotKHR: *hasResult = true; *hasResultType = true; break;
     case OpSubgroupFirstInvocationKHR: *hasResult = true; *hasResultType = true; break;
+    case OpUntypedPtrAccessChainKHR: *hasResult = true; *hasResultType = true; break;
+    case OpUntypedInBoundsPtrAccessChainKHR: *hasResult = true; *hasResultType = true; break;
+    case OpUntypedArrayLengthKHR: *hasResult = true; *hasResultType = true; break;
+    case OpUntypedPrefetchKHR: *hasResult = false; *hasResultType = false; break;
     case OpSubgroupAllKHR: *hasResult = true; *hasResultType = true; break;
     case OpSubgroupAnyKHR: *hasResult = true; *hasResultType = true; break;
     case OpSubgroupAllEqualKHR: *hasResult = true; *hasResultType = true; break;
     case OpGroupNonUniformRotateKHR: *hasResult = true; *hasResultType = true; break;
     case OpSubgroupReadInvocationKHR: *hasResult = true; *hasResultType = true; break;
+    case OpExtInstWithForwardRefsKHR: *hasResult = true; *hasResultType = true; break;
     case OpTraceRayKHR: *hasResult = false; *hasResultType = false; break;
     case OpExecuteCallableKHR: *hasResult = false; *hasResultType = false; break;
     case OpConvertUToAccelerationStructureKHR: *hasResult = true; *hasResultType = true; break;
@@ -2501,6 +2568,9 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) {
     case OpCooperativeMatrixStoreKHR: *hasResult = false; *hasResultType = false; break;
     case OpCooperativeMatrixMulAddKHR: *hasResult = true; *hasResultType = true; break;
     case OpCooperativeMatrixLengthKHR: *hasResult = true; *hasResultType = true; break;
+    case OpConstantCompositeReplicateEXT: *hasResult = true; *hasResultType = true; break;
+    case OpSpecConstantCompositeReplicateEXT: *hasResult = true; *hasResultType = true; break;
+    case OpCompositeConstructReplicateEXT: *hasResult = true; *hasResultType = true; break;
     case OpTypeRayQueryKHR: *hasResult = true; *hasResultType = false; break;
     case OpRayQueryInitializeKHR: *hasResult = false; *hasResultType = false; break;
     case OpRayQueryTerminateKHR: *hasResult = false; *hasResultType = false; break;
@@ -2512,6 +2582,10 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) {
     case OpImageBoxFilterQCOM: *hasResult = true; *hasResultType = true; break;
     case OpImageBlockMatchSSDQCOM: *hasResult = true; *hasResultType = true; break;
     case OpImageBlockMatchSADQCOM: *hasResult = true; *hasResultType = true; break;
+    case OpImageBlockMatchWindowSSDQCOM: *hasResult = true; *hasResultType = true; break;
+    case OpImageBlockMatchWindowSADQCOM: *hasResult = true; *hasResultType = true; break;
+    case OpImageBlockMatchGatherSSDQCOM: *hasResult = true; *hasResultType = true; break;
+    case OpImageBlockMatchGatherSADQCOM: *hasResult = true; *hasResultType = true; break;
     case OpGroupIAddNonUniformAMD: *hasResult = true; *hasResultType = true; break;
     case OpGroupFAddNonUniformAMD: *hasResult = true; *hasResultType = true; break;
     case OpGroupFMinNonUniformAMD: *hasResult = true; *hasResultType = true; break;
@@ -2568,14 +2642,14 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) {
     case OpWritePackedPrimitiveIndices4x8NV: *hasResult = false; *hasResultType = false; break;
     case OpFetchMicroTriangleVertexPositionNV: *hasResult = true; *hasResultType = true; break;
     case OpFetchMicroTriangleVertexBarycentricNV: *hasResult = true; *hasResultType = true; break;
-    case OpReportIntersectionNV: *hasResult = true; *hasResultType = true; break;
+    case OpReportIntersectionKHR: *hasResult = true; *hasResultType = true; break;
     case OpIgnoreIntersectionNV: *hasResult = false; *hasResultType = false; break;
     case OpTerminateRayNV: *hasResult = false; *hasResultType = false; break;
     case OpTraceNV: *hasResult = false; *hasResultType = false; break;
     case OpTraceMotionNV: *hasResult = false; *hasResultType = false; break;
     case OpTraceRayMotionNV: *hasResult = false; *hasResultType = false; break;
     case OpRayQueryGetIntersectionTriangleVertexPositionsKHR: *hasResult = true; *hasResultType = true; break;
-    case OpTypeAccelerationStructureNV: *hasResult = true; *hasResultType = false; break;
+    case OpTypeAccelerationStructureKHR: *hasResult = true; *hasResultType = false; break;
     case OpExecuteCallableNV: *hasResult = false; *hasResultType = false; break;
     case OpTypeCooperativeMatrixNV: *hasResult = true; *hasResultType = false; break;
     case OpCooperativeMatrixLoadNV: *hasResult = true; *hasResultType = true; break;
@@ -2593,6 +2667,7 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) {
     case OpConvertUToSampledImageNV: *hasResult = true; *hasResultType = true; break;
     case OpConvertSampledImageToUNV: *hasResult = true; *hasResultType = true; break;
     case OpSamplerImageAddressingModeNV: *hasResult = false; *hasResultType = false; break;
+    case OpRawAccessChainNV: *hasResult = true; *hasResultType = true; break;
     case OpSubgroupShuffleINTEL: *hasResult = true; *hasResultType = true; break;
     case OpSubgroupShuffleDownINTEL: *hasResult = true; *hasResultType = true; break;
     case OpSubgroupShuffleUpINTEL: *hasResult = true; *hasResultType = true; break;
@@ -2837,6 +2912,7 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) {
     case OpConvertBF16ToFINTEL: *hasResult = true; *hasResultType = true; break;
     case OpControlBarrierArriveINTEL: *hasResult = false; *hasResultType = false; break;
     case OpControlBarrierWaitINTEL: *hasResult = false; *hasResultType = false; break;
+    case OpSubgroupBlockPrefetchINTEL: *hasResult = false; *hasResultType = false; break;
     case OpGroupIMulKHR: *hasResult = true; *hasResultType = true; break;
     case OpGroupFMulKHR: *hasResult = true; *hasResultType = true; break;
     case OpGroupBitwiseAndKHR: *hasResult = true; *hasResultType = true; break;
@@ -2849,6 +2925,1805 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) {
     case OpMaskedScatterINTEL: *hasResult = false; *hasResultType = false; break;
     }
 }
+// Maps a SourceLanguage enumerant to its name with the "SourceLanguage" prefix removed.
+// SourceLanguageUnknown and any value matched by no comparison below both yield "Unknown".
+inline const char* SourceLanguageToString(SourceLanguage value) {
+    if (value == SourceLanguageUnknown) { return "Unknown"; }
+    if (value == SourceLanguageESSL) { return "ESSL"; }
+    if (value == SourceLanguageGLSL) { return "GLSL"; }
+    if (value == SourceLanguageOpenCL_C) { return "OpenCL_C"; }
+    if (value == SourceLanguageOpenCL_CPP) { return "OpenCL_CPP"; }
+    if (value == SourceLanguageHLSL) { return "HLSL"; }
+    if (value == SourceLanguageCPP_for_OpenCL) { return "CPP_for_OpenCL"; }
+    if (value == SourceLanguageSYCL) { return "SYCL"; }
+    if (value == SourceLanguageHERO_C) { return "HERO_C"; }
+    if (value == SourceLanguageNZSL) { return "NZSL"; }
+    if (value == SourceLanguageWGSL) { return "WGSL"; }
+    if (value == SourceLanguageSlang) { return "Slang"; }
+    if (value == SourceLanguageZig) { return "Zig"; }
+    return "Unknown";
+}
+
+// Maps an ExecutionModel enumerant to its name with the "ExecutionModel" prefix removed.
+// A value matched by none of the comparisons below yields "Unknown".
+inline const char* ExecutionModelToString(ExecutionModel value) {
+    if (value == ExecutionModelVertex) { return "Vertex"; }
+    if (value == ExecutionModelTessellationControl) { return "TessellationControl"; }
+    if (value == ExecutionModelTessellationEvaluation) { return "TessellationEvaluation"; }
+    if (value == ExecutionModelGeometry) { return "Geometry"; }
+    if (value == ExecutionModelFragment) { return "Fragment"; }
+    if (value == ExecutionModelGLCompute) { return "GLCompute"; }
+    if (value == ExecutionModelKernel) { return "Kernel"; }
+    if (value == ExecutionModelTaskNV) { return "TaskNV"; }
+    if (value == ExecutionModelMeshNV) { return "MeshNV"; }
+    if (value == ExecutionModelRayGenerationKHR) { return "RayGenerationKHR"; }
+    if (value == ExecutionModelIntersectionKHR) { return "IntersectionKHR"; }
+    if (value == ExecutionModelAnyHitKHR) { return "AnyHitKHR"; }
+    if (value == ExecutionModelClosestHitKHR) { return "ClosestHitKHR"; }
+    if (value == ExecutionModelMissKHR) { return "MissKHR"; }
+    if (value == ExecutionModelCallableKHR) { return "CallableKHR"; }
+    if (value == ExecutionModelTaskEXT) { return "TaskEXT"; }
+    if (value == ExecutionModelMeshEXT) { return "MeshEXT"; }
+    return "Unknown";
+}
+
+// Maps an AddressingModel enumerant to its name with the "AddressingModel" prefix removed.
+// A value matched by none of the comparisons below yields "Unknown".
+inline const char* AddressingModelToString(AddressingModel value) {
+    if (value == AddressingModelLogical) { return "Logical"; }
+    if (value == AddressingModelPhysical32) { return "Physical32"; }
+    if (value == AddressingModelPhysical64) { return "Physical64"; }
+    if (value == AddressingModelPhysicalStorageBuffer64) { return "PhysicalStorageBuffer64"; }
+    return "Unknown";
+}
+
+// Maps a MemoryModel enumerant to its name with the "MemoryModel" prefix removed.
+// A value matched by none of the comparisons below yields "Unknown".
+inline const char* MemoryModelToString(MemoryModel value) {
+    if (value == MemoryModelSimple) { return "Simple"; }
+    if (value == MemoryModelGLSL450) { return "GLSL450"; }
+    if (value == MemoryModelOpenCL) { return "OpenCL"; }
+    if (value == MemoryModelVulkan) { return "Vulkan"; }
+    return "Unknown";
+}
+
+// Maps an ExecutionMode enumerant to its name with the "ExecutionMode" prefix removed.
+// A value matched by none of the comparisons below yields "Unknown".
+inline const char* ExecutionModeToString(ExecutionMode value) {
+    if (value == ExecutionModeInvocations) { return "Invocations"; }
+    if (value == ExecutionModeSpacingEqual) { return "SpacingEqual"; }
+    if (value == ExecutionModeSpacingFractionalEven) { return "SpacingFractionalEven"; }
+    if (value == ExecutionModeSpacingFractionalOdd) { return "SpacingFractionalOdd"; }
+    if (value == ExecutionModeVertexOrderCw) { return "VertexOrderCw"; }
+    if (value == ExecutionModeVertexOrderCcw) { return "VertexOrderCcw"; }
+    if (value == ExecutionModePixelCenterInteger) { return "PixelCenterInteger"; }
+    if (value == ExecutionModeOriginUpperLeft) { return "OriginUpperLeft"; }
+    if (value == ExecutionModeOriginLowerLeft) { return "OriginLowerLeft"; }
+    if (value == ExecutionModeEarlyFragmentTests) { return "EarlyFragmentTests"; }
+    if (value == ExecutionModePointMode) { return "PointMode"; }
+    if (value == ExecutionModeXfb) { return "Xfb"; }
+    if (value == ExecutionModeDepthReplacing) { return "DepthReplacing"; }
+    if (value == ExecutionModeDepthGreater) { return "DepthGreater"; }
+    if (value == ExecutionModeDepthLess) { return "DepthLess"; }
+    if (value == ExecutionModeDepthUnchanged) { return "DepthUnchanged"; }
+    if (value == ExecutionModeLocalSize) { return "LocalSize"; }
+    if (value == ExecutionModeLocalSizeHint) { return "LocalSizeHint"; }
+    if (value == ExecutionModeInputPoints) { return "InputPoints"; }
+    if (value == ExecutionModeInputLines) { return "InputLines"; }
+    if (value == ExecutionModeInputLinesAdjacency) { return "InputLinesAdjacency"; }
+    if (value == ExecutionModeTriangles) { return "Triangles"; }
+    if (value == ExecutionModeInputTrianglesAdjacency) { return "InputTrianglesAdjacency"; }
+    if (value == ExecutionModeQuads) { return "Quads"; }
+    if (value == ExecutionModeIsolines) { return "Isolines"; }
+    if (value == ExecutionModeOutputVertices) { return "OutputVertices"; }
+    if (value == ExecutionModeOutputPoints) { return "OutputPoints"; }
+    if (value == ExecutionModeOutputLineStrip) { return "OutputLineStrip"; }
+    if (value == ExecutionModeOutputTriangleStrip) { return "OutputTriangleStrip"; }
+    if (value == ExecutionModeVecTypeHint) { return "VecTypeHint"; }
+    if (value == ExecutionModeContractionOff) { return "ContractionOff"; }
+    if (value == ExecutionModeInitializer) { return "Initializer"; }
+    if (value == ExecutionModeFinalizer) { return "Finalizer"; }
+    if (value == ExecutionModeSubgroupSize) { return "SubgroupSize"; }
+    if (value == ExecutionModeSubgroupsPerWorkgroup) { return "SubgroupsPerWorkgroup"; }
+    if (value == ExecutionModeSubgroupsPerWorkgroupId) { return "SubgroupsPerWorkgroupId"; }
+    if (value == ExecutionModeLocalSizeId) { return "LocalSizeId"; }
+    if (value == ExecutionModeLocalSizeHintId) { return "LocalSizeHintId"; }
+    if (value == ExecutionModeNonCoherentColorAttachmentReadEXT) { return "NonCoherentColorAttachmentReadEXT"; }
+    if (value == ExecutionModeNonCoherentDepthAttachmentReadEXT) { return "NonCoherentDepthAttachmentReadEXT"; }
+    if (value == ExecutionModeNonCoherentStencilAttachmentReadEXT) { return "NonCoherentStencilAttachmentReadEXT"; }
+    if (value == ExecutionModeSubgroupUniformControlFlowKHR) { return "SubgroupUniformControlFlowKHR"; }
+    if (value == ExecutionModePostDepthCoverage) { return "PostDepthCoverage"; }
+    if (value == ExecutionModeDenormPreserve) { return "DenormPreserve"; }
+    if (value == ExecutionModeDenormFlushToZero) { return "DenormFlushToZero"; }
+    if (value == ExecutionModeSignedZeroInfNanPreserve) { return "SignedZeroInfNanPreserve"; }
+    if (value == ExecutionModeRoundingModeRTE) { return "RoundingModeRTE"; }
+    if (value == ExecutionModeRoundingModeRTZ) { return "RoundingModeRTZ"; }
+    if (value == ExecutionModeEarlyAndLateFragmentTestsAMD) { return "EarlyAndLateFragmentTestsAMD"; }
+    if (value == ExecutionModeStencilRefReplacingEXT) { return "StencilRefReplacingEXT"; }
+    if (value == ExecutionModeCoalescingAMDX) { return "CoalescingAMDX"; }
+    if (value == ExecutionModeMaxNodeRecursionAMDX) { return "MaxNodeRecursionAMDX"; }
+    if (value == ExecutionModeStaticNumWorkgroupsAMDX) { return "StaticNumWorkgroupsAMDX"; }
+    if (value == ExecutionModeShaderIndexAMDX) { return "ShaderIndexAMDX"; }
+    if (value == ExecutionModeMaxNumWorkgroupsAMDX) { return "MaxNumWorkgroupsAMDX"; }
+    if (value == ExecutionModeStencilRefUnchangedFrontAMD) { return "StencilRefUnchangedFrontAMD"; }
+    if (value == ExecutionModeStencilRefGreaterFrontAMD) { return "StencilRefGreaterFrontAMD"; }
+    if (value == ExecutionModeStencilRefLessFrontAMD) { return "StencilRefLessFrontAMD"; }
+    if (value == ExecutionModeStencilRefUnchangedBackAMD) { return "StencilRefUnchangedBackAMD"; }
+    if (value == ExecutionModeStencilRefGreaterBackAMD) { return "StencilRefGreaterBackAMD"; }
+    if (value == ExecutionModeStencilRefLessBackAMD) { return "StencilRefLessBackAMD"; }
+    if (value == ExecutionModeQuadDerivativesKHR) { return "QuadDerivativesKHR"; }
+    if (value == ExecutionModeRequireFullQuadsKHR) { return "RequireFullQuadsKHR"; }
+    if (value == ExecutionModeOutputLinesEXT) { return "OutputLinesEXT"; }
+    if (value == ExecutionModeOutputPrimitivesEXT) { return "OutputPrimitivesEXT"; }
+    if (value == ExecutionModeDerivativeGroupQuadsKHR) { return "DerivativeGroupQuadsKHR"; }
+    if (value == ExecutionModeDerivativeGroupLinearKHR) { return "DerivativeGroupLinearKHR"; }
+    if (value == ExecutionModeOutputTrianglesEXT) { return "OutputTrianglesEXT"; }
+    if (value == ExecutionModePixelInterlockOrderedEXT) { return "PixelInterlockOrderedEXT"; }
+    if (value == ExecutionModePixelInterlockUnorderedEXT) { return "PixelInterlockUnorderedEXT"; }
+    if (value == ExecutionModeSampleInterlockOrderedEXT) { return "SampleInterlockOrderedEXT"; }
+    if (value == ExecutionModeSampleInterlockUnorderedEXT) { return "SampleInterlockUnorderedEXT"; }
+    if (value == ExecutionModeShadingRateInterlockOrderedEXT) { return "ShadingRateInterlockOrderedEXT"; }
+    if (value == ExecutionModeShadingRateInterlockUnorderedEXT) { return "ShadingRateInterlockUnorderedEXT"; }
+    if (value == ExecutionModeSharedLocalMemorySizeINTEL) { return "SharedLocalMemorySizeINTEL"; }
+    if (value == ExecutionModeRoundingModeRTPINTEL) { return "RoundingModeRTPINTEL"; }
+    if (value == ExecutionModeRoundingModeRTNINTEL) { return "RoundingModeRTNINTEL"; }
+    if (value == ExecutionModeFloatingPointModeALTINTEL) { return "FloatingPointModeALTINTEL"; }
+    if (value == ExecutionModeFloatingPointModeIEEEINTEL) { return "FloatingPointModeIEEEINTEL"; }
+    if (value == ExecutionModeMaxWorkgroupSizeINTEL) { return "MaxWorkgroupSizeINTEL"; }
+    if (value == ExecutionModeMaxWorkDimINTEL) { return "MaxWorkDimINTEL"; }
+    if (value == ExecutionModeNoGlobalOffsetINTEL) { return "NoGlobalOffsetINTEL"; }
+    if (value == ExecutionModeNumSIMDWorkitemsINTEL) { return "NumSIMDWorkitemsINTEL"; }
+    if (value == ExecutionModeSchedulerTargetFmaxMhzINTEL) { return "SchedulerTargetFmaxMhzINTEL"; }
+    if (value == ExecutionModeMaximallyReconvergesKHR) { return "MaximallyReconvergesKHR"; }
+    if (value == ExecutionModeFPFastMathDefault) { return "FPFastMathDefault"; }
+    if (value == ExecutionModeStreamingInterfaceINTEL) { return "StreamingInterfaceINTEL"; }
+    if (value == ExecutionModeRegisterMapInterfaceINTEL) { return "RegisterMapInterfaceINTEL"; }
+    if (value == ExecutionModeNamedBarrierCountINTEL) { return "NamedBarrierCountINTEL"; }
+    if (value == ExecutionModeMaximumRegistersINTEL) { return "MaximumRegistersINTEL"; }
+    if (value == ExecutionModeMaximumRegistersIdINTEL) { return "MaximumRegistersIdINTEL"; }
+    if (value == ExecutionModeNamedMaximumRegistersINTEL) { return "NamedMaximumRegistersINTEL"; }
+    return "Unknown";
+}
+
+// Maps a StorageClass enumerant to its name with the "StorageClass" prefix removed.
+// A value matched by none of the comparisons below yields "Unknown".
+inline const char* StorageClassToString(StorageClass value) {
+    if (value == StorageClassUniformConstant) { return "UniformConstant"; }
+    if (value == StorageClassInput) { return "Input"; }
+    if (value == StorageClassUniform) { return "Uniform"; }
+    if (value == StorageClassOutput) { return "Output"; }
+    if (value == StorageClassWorkgroup) { return "Workgroup"; }
+    if (value == StorageClassCrossWorkgroup) { return "CrossWorkgroup"; }
+    if (value == StorageClassPrivate) { return "Private"; }
+    if (value == StorageClassFunction) { return "Function"; }
+    if (value == StorageClassGeneric) { return "Generic"; }
+    if (value == StorageClassPushConstant) { return "PushConstant"; }
+    if (value == StorageClassAtomicCounter) { return "AtomicCounter"; }
+    if (value == StorageClassImage) { return "Image"; }
+    if (value == StorageClassStorageBuffer) { return "StorageBuffer"; }
+    if (value == StorageClassTileImageEXT) { return "TileImageEXT"; }
+    if (value == StorageClassNodePayloadAMDX) { return "NodePayloadAMDX"; }
+    if (value == StorageClassNodeOutputPayloadAMDX) { return "NodeOutputPayloadAMDX"; }
+    if (value == StorageClassCallableDataKHR) { return "CallableDataKHR"; }
+    if (value == StorageClassIncomingCallableDataKHR) { return "IncomingCallableDataKHR"; }
+    if (value == StorageClassRayPayloadKHR) { return "RayPayloadKHR"; }
+    if (value == StorageClassHitAttributeKHR) { return "HitAttributeKHR"; }
+    if (value == StorageClassIncomingRayPayloadKHR) { return "IncomingRayPayloadKHR"; }
+    if (value == StorageClassShaderRecordBufferKHR) { return "ShaderRecordBufferKHR"; }
+    if (value == StorageClassPhysicalStorageBuffer) { return "PhysicalStorageBuffer"; }
+    if (value == StorageClassHitObjectAttributeNV) { return "HitObjectAttributeNV"; }
+    if (value == StorageClassTaskPayloadWorkgroupEXT) { return "TaskPayloadWorkgroupEXT"; }
+    if (value == StorageClassCodeSectionINTEL) { return "CodeSectionINTEL"; }
+    if (value == StorageClassDeviceOnlyINTEL) { return "DeviceOnlyINTEL"; }
+    if (value == StorageClassHostOnlyINTEL) { return "HostOnlyINTEL"; }
+    return "Unknown";
+}
+
+// Maps a Dim enumerant to its name with the "Dim" prefix removed (e.g. Dim2D -> "2D").
+// A value matched by none of the comparisons below yields "Unknown".
+inline const char* DimToString(Dim value) {
+    if (value == Dim1D) { return "1D"; }
+    if (value == Dim2D) { return "2D"; }
+    if (value == Dim3D) { return "3D"; }
+    if (value == DimCube) { return "Cube"; }
+    if (value == DimRect) { return "Rect"; }
+    if (value == DimBuffer) { return "Buffer"; }
+    if (value == DimSubpassData) { return "SubpassData"; }
+    if (value == DimTileImageDataEXT) { return "TileImageDataEXT"; }
+    return "Unknown";
+}
+
+// Maps a SamplerAddressingMode enumerant to its name with the "SamplerAddressingMode" prefix removed.
+// A value matched by none of the comparisons below yields "Unknown".
+inline const char* SamplerAddressingModeToString(SamplerAddressingMode value) {
+    if (value == SamplerAddressingModeNone) { return "None"; }
+    if (value == SamplerAddressingModeClampToEdge) { return "ClampToEdge"; }
+    if (value == SamplerAddressingModeClamp) { return "Clamp"; }
+    if (value == SamplerAddressingModeRepeat) { return "Repeat"; }
+    if (value == SamplerAddressingModeRepeatMirrored) { return "RepeatMirrored"; }
+    return "Unknown";
+}
+
+// Maps a SamplerFilterMode enumerant to its name with the "SamplerFilterMode" prefix removed.
+// A value matched by neither comparison below yields "Unknown".
+inline const char* SamplerFilterModeToString(SamplerFilterMode value) {
+    if (value == SamplerFilterModeNearest) { return "Nearest"; }
+    if (value == SamplerFilterModeLinear) { return "Linear"; }
+    return "Unknown";
+}
+
+// Maps an ImageFormat enumerant to its name with the "ImageFormat" prefix removed.
+// ImageFormatUnknown and any value matched by no comparison below both yield "Unknown".
+inline const char* ImageFormatToString(ImageFormat value) {
+    if (value == ImageFormatUnknown) { return "Unknown"; }
+    if (value == ImageFormatRgba32f) { return "Rgba32f"; }
+    if (value == ImageFormatRgba16f) { return "Rgba16f"; }
+    if (value == ImageFormatR32f) { return "R32f"; }
+    if (value == ImageFormatRgba8) { return "Rgba8"; }
+    if (value == ImageFormatRgba8Snorm) { return "Rgba8Snorm"; }
+    if (value == ImageFormatRg32f) { return "Rg32f"; }
+    if (value == ImageFormatRg16f) { return "Rg16f"; }
+    if (value == ImageFormatR11fG11fB10f) { return "R11fG11fB10f"; }
+    if (value == ImageFormatR16f) { return "R16f"; }
+    if (value == ImageFormatRgba16) { return "Rgba16"; }
+    if (value == ImageFormatRgb10A2) { return "Rgb10A2"; }
+    if (value == ImageFormatRg16) { return "Rg16"; }
+    if (value == ImageFormatRg8) { return "Rg8"; }
+    if (value == ImageFormatR16) { return "R16"; }
+    if (value == ImageFormatR8) { return "R8"; }
+    if (value == ImageFormatRgba16Snorm) { return "Rgba16Snorm"; }
+    if (value == ImageFormatRg16Snorm) { return "Rg16Snorm"; }
+    if (value == ImageFormatRg8Snorm) { return "Rg8Snorm"; }
+    if (value == ImageFormatR16Snorm) { return "R16Snorm"; }
+    if (value == ImageFormatR8Snorm) { return "R8Snorm"; }
+    if (value == ImageFormatRgba32i) { return "Rgba32i"; }
+    if (value == ImageFormatRgba16i) { return "Rgba16i"; }
+    if (value == ImageFormatRgba8i) { return "Rgba8i"; }
+    if (value == ImageFormatR32i) { return "R32i"; }
+    if (value == ImageFormatRg32i) { return "Rg32i"; }
+    if (value == ImageFormatRg16i) { return "Rg16i"; }
+    if (value == ImageFormatRg8i) { return "Rg8i"; }
+    if (value == ImageFormatR16i) { return "R16i"; }
+    if (value == ImageFormatR8i) { return "R8i"; }
+    if (value == ImageFormatRgba32ui) { return "Rgba32ui"; }
+    if (value == ImageFormatRgba16ui) { return "Rgba16ui"; }
+    if (value == ImageFormatRgba8ui) { return "Rgba8ui"; }
+    if (value == ImageFormatR32ui) { return "R32ui"; }
+    if (value == ImageFormatRgb10a2ui) { return "Rgb10a2ui"; }
+    if (value == ImageFormatRg32ui) { return "Rg32ui"; }
+    if (value == ImageFormatRg16ui) { return "Rg16ui"; }
+    if (value == ImageFormatRg8ui) { return "Rg8ui"; }
+    if (value == ImageFormatR16ui) { return "R16ui"; }
+    if (value == ImageFormatR8ui) { return "R8ui"; }
+    if (value == ImageFormatR64ui) { return "R64ui"; }
+    if (value == ImageFormatR64i) { return "R64i"; }
+    return "Unknown";
+}
+
+// Maps an ImageChannelOrder enumerant to its name with the "ImageChannelOrder" prefix removed
+// (so ImageChannelOrdersRGB -> "sRGB"); a value matched by no comparison below yields "Unknown".
+inline const char* ImageChannelOrderToString(ImageChannelOrder value) {
+    if (value == ImageChannelOrderR) { return "R"; }
+    if (value == ImageChannelOrderA) { return "A"; }
+    if (value == ImageChannelOrderRG) { return "RG"; }
+    if (value == ImageChannelOrderRA) { return "RA"; }
+    if (value == ImageChannelOrderRGB) { return "RGB"; }
+    if (value == ImageChannelOrderRGBA) { return "RGBA"; }
+    if (value == ImageChannelOrderBGRA) { return "BGRA"; }
+    if (value == ImageChannelOrderARGB) { return "ARGB"; }
+    if (value == ImageChannelOrderIntensity) { return "Intensity"; }
+    if (value == ImageChannelOrderLuminance) { return "Luminance"; }
+    if (value == ImageChannelOrderRx) { return "Rx"; }
+    if (value == ImageChannelOrderRGx) { return "RGx"; }
+    if (value == ImageChannelOrderRGBx) { return "RGBx"; }
+    if (value == ImageChannelOrderDepth) { return "Depth"; }
+    if (value == ImageChannelOrderDepthStencil) { return "DepthStencil"; }
+    if (value == ImageChannelOrdersRGB) { return "sRGB"; }
+    if (value == ImageChannelOrdersRGBx) { return "sRGBx"; }
+    if (value == ImageChannelOrdersRGBA) { return "sRGBA"; }
+    if (value == ImageChannelOrdersBGRA) { return "sBGRA"; }
+    if (value == ImageChannelOrderABGR) { return "ABGR"; }
+    return "Unknown";
+}
+
+// Maps an ImageChannelDataType enumerant to its name with the "ImageChannelDataType" prefix removed.
+// A value matched by none of the comparisons below yields "Unknown".
+inline const char* ImageChannelDataTypeToString(ImageChannelDataType value) {
+    if (value == ImageChannelDataTypeSnormInt8) { return "SnormInt8"; }
+    if (value == ImageChannelDataTypeSnormInt16) { return "SnormInt16"; }
+    if (value == ImageChannelDataTypeUnormInt8) { return "UnormInt8"; }
+    if (value == ImageChannelDataTypeUnormInt16) { return "UnormInt16"; }
+    if (value == ImageChannelDataTypeUnormShort565) { return "UnormShort565"; }
+    if (value == ImageChannelDataTypeUnormShort555) { return "UnormShort555"; }
+    if (value == ImageChannelDataTypeUnormInt101010) { return "UnormInt101010"; }
+    if (value == ImageChannelDataTypeSignedInt8) { return "SignedInt8"; }
+    if (value == ImageChannelDataTypeSignedInt16) { return "SignedInt16"; }
+    if (value == ImageChannelDataTypeSignedInt32) { return "SignedInt32"; }
+    if (value == ImageChannelDataTypeUnsignedInt8) { return "UnsignedInt8"; }
+    if (value == ImageChannelDataTypeUnsignedInt16) { return "UnsignedInt16"; }
+    if (value == ImageChannelDataTypeUnsignedInt32) { return "UnsignedInt32"; }
+    if (value == ImageChannelDataTypeHalfFloat) { return "HalfFloat"; }
+    if (value == ImageChannelDataTypeFloat) { return "Float"; }
+    if (value == ImageChannelDataTypeUnormInt24) { return "UnormInt24"; }
+    if (value == ImageChannelDataTypeUnormInt101010_2) { return "UnormInt101010_2"; }
+    if (value == ImageChannelDataTypeUnsignedIntRaw10EXT) { return "UnsignedIntRaw10EXT"; }
+    if (value == ImageChannelDataTypeUnsignedIntRaw12EXT) { return "UnsignedIntRaw12EXT"; }
+    if (value == ImageChannelDataTypeUnormInt2_101010EXT) { return "UnormInt2_101010EXT"; }
+    return "Unknown";
+}
+
+// Maps an FPRoundingMode enumerant to its name with the "FPRoundingMode" prefix removed.
+// A value matched by none of the comparisons below yields "Unknown".
+inline const char* FPRoundingModeToString(FPRoundingMode value) {
+    if (value == FPRoundingModeRTE) { return "RTE"; }
+    if (value == FPRoundingModeRTZ) { return "RTZ"; }
+    if (value == FPRoundingModeRTP) { return "RTP"; }
+    if (value == FPRoundingModeRTN) { return "RTN"; }
+    return "Unknown";
+}
+
+// Maps a LinkageType enumerant to its name with the "LinkageType" prefix removed.
+// A value matched by none of the comparisons below yields "Unknown".
+inline const char* LinkageTypeToString(LinkageType value) {
+    if (value == LinkageTypeExport) { return "Export"; }
+    if (value == LinkageTypeImport) { return "Import"; }
+    if (value == LinkageTypeLinkOnceODR) { return "LinkOnceODR"; }
+    return "Unknown";
+}
+
+// Maps an AccessQualifier enumerant to its name with the "AccessQualifier" prefix removed.
+// A value matched by none of the comparisons below yields "Unknown".
+inline const char* AccessQualifierToString(AccessQualifier value) {
+    if (value == AccessQualifierReadOnly) { return "ReadOnly"; }
+    if (value == AccessQualifierWriteOnly) { return "WriteOnly"; }
+    if (value == AccessQualifierReadWrite) { return "ReadWrite"; }
+    return "Unknown";
+}
+
+// Maps a FunctionParameterAttribute enumerant to its name with the "FunctionParameterAttribute" prefix removed.
+// A value matched by none of the comparisons below yields "Unknown".
+inline const char* FunctionParameterAttributeToString(FunctionParameterAttribute value) {
+    if (value == FunctionParameterAttributeZext) { return "Zext"; }
+    if (value == FunctionParameterAttributeSext) { return "Sext"; }
+    if (value == FunctionParameterAttributeByVal) { return "ByVal"; }
+    if (value == FunctionParameterAttributeSret) { return "Sret"; }
+    if (value == FunctionParameterAttributeNoAlias) { return "NoAlias"; }
+    if (value == FunctionParameterAttributeNoCapture) { return "NoCapture"; }
+    if (value == FunctionParameterAttributeNoWrite) { return "NoWrite"; }
+    if (value == FunctionParameterAttributeNoReadWrite) { return "NoReadWrite"; }
+    if (value == FunctionParameterAttributeRuntimeAlignedINTEL) { return "RuntimeAlignedINTEL"; }
+    return "Unknown";
+}
+
+inline const char* DecorationToString(Decoration value) {
+    switch (value) {
+    case DecorationRelaxedPrecision: return "RelaxedPrecision";
+    case DecorationSpecId: return "SpecId";
+    case DecorationBlock: return "Block";
+    case DecorationBufferBlock: return "BufferBlock";
+    case DecorationRowMajor: return "RowMajor";
+    case DecorationColMajor: return "ColMajor";
+    case DecorationArrayStride: return "ArrayStride";
+    case DecorationMatrixStride: return "MatrixStride";
+    case DecorationGLSLShared: return "GLSLShared";
+    case DecorationGLSLPacked: return "GLSLPacked";
+    case DecorationCPacked: return "CPacked";
+    case DecorationBuiltIn: return "BuiltIn";
+    case DecorationNoPerspective: return "NoPerspective";
+    case DecorationFlat: return "Flat";
+    case DecorationPatch: return "Patch";
+    case DecorationCentroid: return "Centroid";
+    case DecorationSample: return "Sample";
+    case DecorationInvariant: return "Invariant";
+    case DecorationRestrict: return "Restrict";
+    case DecorationAliased: return "Aliased";
+    case DecorationVolatile: return "Volatile";
+    case DecorationConstant: return "Constant";
+    case DecorationCoherent: return "Coherent";
+    case DecorationNonWritable: return "NonWritable";
+    case DecorationNonReadable: return "NonReadable";
+    case DecorationUniform: return "Uniform";
+    case DecorationUniformId: return "UniformId";
+    case DecorationSaturatedConversion: return "SaturatedConversion";
+    case DecorationStream: return "Stream";
+    case DecorationLocation: return "Location";
+    case DecorationComponent: return "Component";
+    case DecorationIndex: return "Index";
+    case DecorationBinding: return "Binding";
+    case DecorationDescriptorSet: return "DescriptorSet";
+    case DecorationOffset: return "Offset";
+    case DecorationXfbBuffer: return "XfbBuffer";
+    case DecorationXfbStride: return "XfbStride";
+    case DecorationFuncParamAttr: return "FuncParamAttr";
+    case DecorationFPRoundingMode: return "FPRoundingMode";
+    case DecorationFPFastMathMode: return "FPFastMathMode";
+    case DecorationLinkageAttributes: return "LinkageAttributes";
+    case DecorationNoContraction: return "NoContraction";
+    case DecorationInputAttachmentIndex: return "InputAttachmentIndex";
+    case DecorationAlignment: return "Alignment";
+    case DecorationMaxByteOffset: return "MaxByteOffset";
+    case DecorationAlignmentId: return "AlignmentId";
+    case DecorationMaxByteOffsetId: return "MaxByteOffsetId";
+    case DecorationNoSignedWrap: return "NoSignedWrap";
+    case DecorationNoUnsignedWrap: return "NoUnsignedWrap";
+    case DecorationWeightTextureQCOM: return "WeightTextureQCOM";
+    case DecorationBlockMatchTextureQCOM: return "BlockMatchTextureQCOM";
+    case DecorationBlockMatchSamplerQCOM: return "BlockMatchSamplerQCOM";
+    case DecorationExplicitInterpAMD: return "ExplicitInterpAMD";
+    case DecorationNodeSharesPayloadLimitsWithAMDX: return "NodeSharesPayloadLimitsWithAMDX";
+    case DecorationNodeMaxPayloadsAMDX: return "NodeMaxPayloadsAMDX";
+    case DecorationTrackFinishWritingAMDX: return "TrackFinishWritingAMDX";
+    case DecorationPayloadNodeNameAMDX: return "PayloadNodeNameAMDX";
+    case DecorationOverrideCoverageNV: return "OverrideCoverageNV";
+    case DecorationPassthroughNV: return "PassthroughNV";
+    case DecorationViewportRelativeNV: return "ViewportRelativeNV";
+    case DecorationSecondaryViewportRelativeNV: return "SecondaryViewportRelativeNV";
+    case DecorationPerPrimitiveEXT: return "PerPrimitiveEXT";
+    case DecorationPerViewNV: return "PerViewNV";
+    case DecorationPerTaskNV: return "PerTaskNV";
+    case DecorationPerVertexKHR: return "PerVertexKHR";
+    case DecorationNonUniform: return "NonUniform";
+    case DecorationRestrictPointer: return "RestrictPointer";
+    case DecorationAliasedPointer: return "AliasedPointer";
+    case DecorationHitObjectShaderRecordBufferNV: return "HitObjectShaderRecordBufferNV";
+    case DecorationBindlessSamplerNV: return "BindlessSamplerNV";
+    case DecorationBindlessImageNV: return "BindlessImageNV";
+    case DecorationBoundSamplerNV: return "BoundSamplerNV";
+    case DecorationBoundImageNV: return "BoundImageNV";
+    case DecorationSIMTCallINTEL: return "SIMTCallINTEL";
+    case DecorationReferencedIndirectlyINTEL: return "ReferencedIndirectlyINTEL";
+    case DecorationClobberINTEL: return "ClobberINTEL";
+    case DecorationSideEffectsINTEL: return "SideEffectsINTEL";
+    case DecorationVectorComputeVariableINTEL: return "VectorComputeVariableINTEL";
+    case DecorationFuncParamIOKindINTEL: return "FuncParamIOKindINTEL";
+    case DecorationVectorComputeFunctionINTEL: return "VectorComputeFunctionINTEL";
+    case DecorationStackCallINTEL: return "StackCallINTEL";
+    case DecorationGlobalVariableOffsetINTEL: return "GlobalVariableOffsetINTEL";
+    case DecorationCounterBuffer: return "CounterBuffer";
+    case DecorationHlslSemanticGOOGLE: return "HlslSemanticGOOGLE";
+    case DecorationUserTypeGOOGLE: return "UserTypeGOOGLE";
+    case DecorationFunctionRoundingModeINTEL: return "FunctionRoundingModeINTEL";
+    case DecorationFunctionDenormModeINTEL: return "FunctionDenormModeINTEL";
+    case DecorationRegisterINTEL: return "RegisterINTEL";
+    case DecorationMemoryINTEL: return "MemoryINTEL";
+    case DecorationNumbanksINTEL: return "NumbanksINTEL";
+    case DecorationBankwidthINTEL: return "BankwidthINTEL";
+    case DecorationMaxPrivateCopiesINTEL: return "MaxPrivateCopiesINTEL";
+    case DecorationSinglepumpINTEL: return "SinglepumpINTEL";
+    case DecorationDoublepumpINTEL: return "DoublepumpINTEL";
+    case DecorationMaxReplicatesINTEL: return "MaxReplicatesINTEL";
+    case DecorationSimpleDualPortINTEL: return "SimpleDualPortINTEL";
+    case DecorationMergeINTEL: return "MergeINTEL";
+    case DecorationBankBitsINTEL: return "BankBitsINTEL";
+    case DecorationForcePow2DepthINTEL: return "ForcePow2DepthINTEL";
+    case DecorationStridesizeINTEL: return "StridesizeINTEL";
+    case DecorationWordsizeINTEL: return "WordsizeINTEL";
+    case DecorationTrueDualPortINTEL: return "TrueDualPortINTEL";
+    case DecorationBurstCoalesceINTEL: return "BurstCoalesceINTEL";
+    case DecorationCacheSizeINTEL: return "CacheSizeINTEL";
+    case DecorationDontStaticallyCoalesceINTEL: return "DontStaticallyCoalesceINTEL";
+    case DecorationPrefetchINTEL: return "PrefetchINTEL";
+    case DecorationStallEnableINTEL: return "StallEnableINTEL";
+    case DecorationFuseLoopsInFunctionINTEL: return "FuseLoopsInFunctionINTEL";
+    case DecorationMathOpDSPModeINTEL: return "MathOpDSPModeINTEL";
+    case DecorationAliasScopeINTEL: return "AliasScopeINTEL";
+    case DecorationNoAliasINTEL: return "NoAliasINTEL";
+    case DecorationInitiationIntervalINTEL: return "InitiationIntervalINTEL";
+    case DecorationMaxConcurrencyINTEL: return "MaxConcurrencyINTEL";
+    case DecorationPipelineEnableINTEL: return "PipelineEnableINTEL";
+    case DecorationBufferLocationINTEL: return "BufferLocationINTEL";
+    case DecorationIOPipeStorageINTEL: return "IOPipeStorageINTEL";
+    case DecorationFunctionFloatingPointModeINTEL: return "FunctionFloatingPointModeINTEL";
+    case DecorationSingleElementVectorINTEL: return "SingleElementVectorINTEL";
+    case DecorationVectorComputeCallableFunctionINTEL: return "VectorComputeCallableFunctionINTEL";
+    case DecorationMediaBlockIOINTEL: return "MediaBlockIOINTEL";
+    case DecorationStallFreeINTEL: return "StallFreeINTEL";
+    case DecorationFPMaxErrorDecorationINTEL: return "FPMaxErrorDecorationINTEL";
+    case DecorationLatencyControlLabelINTEL: return "LatencyControlLabelINTEL";
+    case DecorationLatencyControlConstraintINTEL: return "LatencyControlConstraintINTEL";
+    case DecorationConduitKernelArgumentINTEL: return "ConduitKernelArgumentINTEL";
+    case DecorationRegisterMapKernelArgumentINTEL: return "RegisterMapKernelArgumentINTEL";
+    case DecorationMMHostInterfaceAddressWidthINTEL: return "MMHostInterfaceAddressWidthINTEL";
+    case DecorationMMHostInterfaceDataWidthINTEL: return "MMHostInterfaceDataWidthINTEL";
+    case DecorationMMHostInterfaceLatencyINTEL: return "MMHostInterfaceLatencyINTEL";
+    case DecorationMMHostInterfaceReadWriteModeINTEL: return "MMHostInterfaceReadWriteModeINTEL";
+    case DecorationMMHostInterfaceMaxBurstINTEL: return "MMHostInterfaceMaxBurstINTEL";
+    case DecorationMMHostInterfaceWaitRequestINTEL: return "MMHostInterfaceWaitRequestINTEL";
+    case DecorationStableKernelArgumentINTEL: return "StableKernelArgumentINTEL";
+    case DecorationHostAccessINTEL: return "HostAccessINTEL";
+    case DecorationInitModeINTEL: return "InitModeINTEL";
+    case DecorationImplementInRegisterMapINTEL: return "ImplementInRegisterMapINTEL";
+    case DecorationCacheControlLoadINTEL: return "CacheControlLoadINTEL";
+    case DecorationCacheControlStoreINTEL: return "CacheControlStoreINTEL";
+    default: return "Unknown";
+    }
+}
+
+inline const char* BuiltInToString(BuiltIn value) { // Returns the enumerator's name as a string literal, without the "BuiltIn" prefix.
+    switch (value) {
+    case BuiltInPosition: return "Position";
+    case BuiltInPointSize: return "PointSize";
+    case BuiltInClipDistance: return "ClipDistance";
+    case BuiltInCullDistance: return "CullDistance";
+    case BuiltInVertexId: return "VertexId";
+    case BuiltInInstanceId: return "InstanceId";
+    case BuiltInPrimitiveId: return "PrimitiveId";
+    case BuiltInInvocationId: return "InvocationId";
+    case BuiltInLayer: return "Layer";
+    case BuiltInViewportIndex: return "ViewportIndex";
+    case BuiltInTessLevelOuter: return "TessLevelOuter";
+    case BuiltInTessLevelInner: return "TessLevelInner";
+    case BuiltInTessCoord: return "TessCoord";
+    case BuiltInPatchVertices: return "PatchVertices";
+    case BuiltInFragCoord: return "FragCoord";
+    case BuiltInPointCoord: return "PointCoord";
+    case BuiltInFrontFacing: return "FrontFacing";
+    case BuiltInSampleId: return "SampleId";
+    case BuiltInSamplePosition: return "SamplePosition";
+    case BuiltInSampleMask: return "SampleMask";
+    case BuiltInFragDepth: return "FragDepth";
+    case BuiltInHelperInvocation: return "HelperInvocation";
+    case BuiltInNumWorkgroups: return "NumWorkgroups";
+    case BuiltInWorkgroupSize: return "WorkgroupSize";
+    case BuiltInWorkgroupId: return "WorkgroupId";
+    case BuiltInLocalInvocationId: return "LocalInvocationId";
+    case BuiltInGlobalInvocationId: return "GlobalInvocationId";
+    case BuiltInLocalInvocationIndex: return "LocalInvocationIndex";
+    case BuiltInWorkDim: return "WorkDim";
+    case BuiltInGlobalSize: return "GlobalSize";
+    case BuiltInEnqueuedWorkgroupSize: return "EnqueuedWorkgroupSize";
+    case BuiltInGlobalOffset: return "GlobalOffset";
+    case BuiltInGlobalLinearId: return "GlobalLinearId";
+    case BuiltInSubgroupSize: return "SubgroupSize";
+    case BuiltInSubgroupMaxSize: return "SubgroupMaxSize";
+    case BuiltInNumSubgroups: return "NumSubgroups";
+    case BuiltInNumEnqueuedSubgroups: return "NumEnqueuedSubgroups";
+    case BuiltInSubgroupId: return "SubgroupId";
+    case BuiltInSubgroupLocalInvocationId: return "SubgroupLocalInvocationId";
+    case BuiltInVertexIndex: return "VertexIndex";
+    case BuiltInInstanceIndex: return "InstanceIndex";
+    case BuiltInCoreIDARM: return "CoreIDARM";
+    case BuiltInCoreCountARM: return "CoreCountARM";
+    case BuiltInCoreMaxIDARM: return "CoreMaxIDARM";
+    case BuiltInWarpIDARM: return "WarpIDARM";
+    case BuiltInWarpMaxIDARM: return "WarpMaxIDARM";
+    case BuiltInSubgroupEqMask: return "SubgroupEqMask";
+    case BuiltInSubgroupGeMask: return "SubgroupGeMask";
+    case BuiltInSubgroupGtMask: return "SubgroupGtMask";
+    case BuiltInSubgroupLeMask: return "SubgroupLeMask";
+    case BuiltInSubgroupLtMask: return "SubgroupLtMask";
+    case BuiltInBaseVertex: return "BaseVertex";
+    case BuiltInBaseInstance: return "BaseInstance";
+    case BuiltInDrawIndex: return "DrawIndex";
+    case BuiltInPrimitiveShadingRateKHR: return "PrimitiveShadingRateKHR";
+    case BuiltInDeviceIndex: return "DeviceIndex";
+    case BuiltInViewIndex: return "ViewIndex";
+    case BuiltInShadingRateKHR: return "ShadingRateKHR";
+    case BuiltInBaryCoordNoPerspAMD: return "BaryCoordNoPerspAMD";
+    case BuiltInBaryCoordNoPerspCentroidAMD: return "BaryCoordNoPerspCentroidAMD";
+    case BuiltInBaryCoordNoPerspSampleAMD: return "BaryCoordNoPerspSampleAMD";
+    case BuiltInBaryCoordSmoothAMD: return "BaryCoordSmoothAMD";
+    case BuiltInBaryCoordSmoothCentroidAMD: return "BaryCoordSmoothCentroidAMD";
+    case BuiltInBaryCoordSmoothSampleAMD: return "BaryCoordSmoothSampleAMD";
+    case BuiltInBaryCoordPullModelAMD: return "BaryCoordPullModelAMD";
+    case BuiltInFragStencilRefEXT: return "FragStencilRefEXT";
+    case BuiltInCoalescedInputCountAMDX: return "CoalescedInputCountAMDX";
+    case BuiltInShaderIndexAMDX: return "ShaderIndexAMDX";
+    case BuiltInViewportMaskNV: return "ViewportMaskNV";
+    case BuiltInSecondaryPositionNV: return "SecondaryPositionNV";
+    case BuiltInSecondaryViewportMaskNV: return "SecondaryViewportMaskNV";
+    case BuiltInPositionPerViewNV: return "PositionPerViewNV";
+    case BuiltInViewportMaskPerViewNV: return "ViewportMaskPerViewNV";
+    case BuiltInFullyCoveredEXT: return "FullyCoveredEXT";
+    case BuiltInTaskCountNV: return "TaskCountNV";
+    case BuiltInPrimitiveCountNV: return "PrimitiveCountNV";
+    case BuiltInPrimitiveIndicesNV: return "PrimitiveIndicesNV";
+    case BuiltInClipDistancePerViewNV: return "ClipDistancePerViewNV";
+    case BuiltInCullDistancePerViewNV: return "CullDistancePerViewNV";
+    case BuiltInLayerPerViewNV: return "LayerPerViewNV";
+    case BuiltInMeshViewCountNV: return "MeshViewCountNV";
+    case BuiltInMeshViewIndicesNV: return "MeshViewIndicesNV";
+    case BuiltInBaryCoordKHR: return "BaryCoordKHR";
+    case BuiltInBaryCoordNoPerspKHR: return "BaryCoordNoPerspKHR";
+    case BuiltInFragSizeEXT: return "FragSizeEXT";
+    case BuiltInFragInvocationCountEXT: return "FragInvocationCountEXT";
+    case BuiltInPrimitivePointIndicesEXT: return "PrimitivePointIndicesEXT";
+    case BuiltInPrimitiveLineIndicesEXT: return "PrimitiveLineIndicesEXT";
+    case BuiltInPrimitiveTriangleIndicesEXT: return "PrimitiveTriangleIndicesEXT";
+    case BuiltInCullPrimitiveEXT: return "CullPrimitiveEXT";
+    case BuiltInLaunchIdKHR: return "LaunchIdKHR";
+    case BuiltInLaunchSizeKHR: return "LaunchSizeKHR";
+    case BuiltInWorldRayOriginKHR: return "WorldRayOriginKHR";
+    case BuiltInWorldRayDirectionKHR: return "WorldRayDirectionKHR";
+    case BuiltInObjectRayOriginKHR: return "ObjectRayOriginKHR";
+    case BuiltInObjectRayDirectionKHR: return "ObjectRayDirectionKHR";
+    case BuiltInRayTminKHR: return "RayTminKHR";
+    case BuiltInRayTmaxKHR: return "RayTmaxKHR";
+    case BuiltInInstanceCustomIndexKHR: return "InstanceCustomIndexKHR";
+    case BuiltInObjectToWorldKHR: return "ObjectToWorldKHR";
+    case BuiltInWorldToObjectKHR: return "WorldToObjectKHR";
+    case BuiltInHitTNV: return "HitTNV";
+    case BuiltInHitKindKHR: return "HitKindKHR";
+    case BuiltInCurrentRayTimeNV: return "CurrentRayTimeNV";
+    case BuiltInHitTriangleVertexPositionsKHR: return "HitTriangleVertexPositionsKHR";
+    case BuiltInHitMicroTriangleVertexPositionsNV: return "HitMicroTriangleVertexPositionsNV";
+    case BuiltInHitMicroTriangleVertexBarycentricsNV: return "HitMicroTriangleVertexBarycentricsNV";
+    case BuiltInIncomingRayFlagsKHR: return "IncomingRayFlagsKHR";
+    case BuiltInRayGeometryIndexKHR: return "RayGeometryIndexKHR";
+    case BuiltInWarpsPerSMNV: return "WarpsPerSMNV";
+    case BuiltInSMCountNV: return "SMCountNV";
+    case BuiltInWarpIDNV: return "WarpIDNV";
+    case BuiltInSMIDNV: return "SMIDNV";
+    case BuiltInHitKindFrontFacingMicroTriangleNV: return "HitKindFrontFacingMicroTriangleNV";
+    case BuiltInHitKindBackFacingMicroTriangleNV: return "HitKindBackFacingMicroTriangleNV";
+    case BuiltInCullMaskKHR: return "CullMaskKHR";
+    default: return "Unknown"; // Fallback for any value not matched by a case above.
+    }
+}
+
+inline const char* ScopeToString(Scope value) { // Returns the enumerator's name as a string literal, without the "Scope" prefix.
+    switch (value) {
+    case ScopeCrossDevice: return "CrossDevice";
+    case ScopeDevice: return "Device";
+    case ScopeWorkgroup: return "Workgroup";
+    case ScopeSubgroup: return "Subgroup";
+    case ScopeInvocation: return "Invocation";
+    case ScopeQueueFamily: return "QueueFamily";
+    case ScopeShaderCallKHR: return "ShaderCallKHR";
+    default: return "Unknown"; // Fallback for any value not matched by a case above.
+    }
+}
+
+inline const char* GroupOperationToString(GroupOperation value) { // Returns the enumerator's name as a string literal, without the "GroupOperation" prefix.
+    switch (value) {
+    case GroupOperationReduce: return "Reduce";
+    case GroupOperationInclusiveScan: return "InclusiveScan";
+    case GroupOperationExclusiveScan: return "ExclusiveScan";
+    case GroupOperationClusteredReduce: return "ClusteredReduce";
+    case GroupOperationPartitionedReduceNV: return "PartitionedReduceNV";
+    case GroupOperationPartitionedInclusiveScanNV: return "PartitionedInclusiveScanNV";
+    case GroupOperationPartitionedExclusiveScanNV: return "PartitionedExclusiveScanNV";
+    default: return "Unknown"; // Fallback for any value not matched by a case above.
+    }
+}
+
+inline const char* KernelEnqueueFlagsToString(KernelEnqueueFlags value) { // Returns the enumerator's name as a string literal, without the "KernelEnqueueFlags" prefix.
+    switch (value) {
+    case KernelEnqueueFlagsNoWait: return "NoWait";
+    case KernelEnqueueFlagsWaitKernel: return "WaitKernel";
+    case KernelEnqueueFlagsWaitWorkGroup: return "WaitWorkGroup";
+    default: return "Unknown"; // Fallback for any value not matched by a case above.
+    }
+}
+
+inline const char* CapabilityToString(Capability value) { // Returns the enumerator's name as a string literal, without the "Capability" prefix.
+    switch (value) {
+    case CapabilityMatrix: return "Matrix";
+    case CapabilityShader: return "Shader";
+    case CapabilityGeometry: return "Geometry";
+    case CapabilityTessellation: return "Tessellation";
+    case CapabilityAddresses: return "Addresses";
+    case CapabilityLinkage: return "Linkage";
+    case CapabilityKernel: return "Kernel";
+    case CapabilityVector16: return "Vector16";
+    case CapabilityFloat16Buffer: return "Float16Buffer";
+    case CapabilityFloat16: return "Float16";
+    case CapabilityFloat64: return "Float64";
+    case CapabilityInt64: return "Int64";
+    case CapabilityInt64Atomics: return "Int64Atomics";
+    case CapabilityImageBasic: return "ImageBasic";
+    case CapabilityImageReadWrite: return "ImageReadWrite";
+    case CapabilityImageMipmap: return "ImageMipmap";
+    case CapabilityPipes: return "Pipes";
+    case CapabilityGroups: return "Groups";
+    case CapabilityDeviceEnqueue: return "DeviceEnqueue";
+    case CapabilityLiteralSampler: return "LiteralSampler";
+    case CapabilityAtomicStorage: return "AtomicStorage";
+    case CapabilityInt16: return "Int16";
+    case CapabilityTessellationPointSize: return "TessellationPointSize";
+    case CapabilityGeometryPointSize: return "GeometryPointSize";
+    case CapabilityImageGatherExtended: return "ImageGatherExtended";
+    case CapabilityStorageImageMultisample: return "StorageImageMultisample";
+    case CapabilityUniformBufferArrayDynamicIndexing: return "UniformBufferArrayDynamicIndexing";
+    case CapabilitySampledImageArrayDynamicIndexing: return "SampledImageArrayDynamicIndexing";
+    case CapabilityStorageBufferArrayDynamicIndexing: return "StorageBufferArrayDynamicIndexing";
+    case CapabilityStorageImageArrayDynamicIndexing: return "StorageImageArrayDynamicIndexing";
+    case CapabilityClipDistance: return "ClipDistance";
+    case CapabilityCullDistance: return "CullDistance";
+    case CapabilityImageCubeArray: return "ImageCubeArray";
+    case CapabilitySampleRateShading: return "SampleRateShading";
+    case CapabilityImageRect: return "ImageRect";
+    case CapabilitySampledRect: return "SampledRect";
+    case CapabilityGenericPointer: return "GenericPointer";
+    case CapabilityInt8: return "Int8";
+    case CapabilityInputAttachment: return "InputAttachment";
+    case CapabilitySparseResidency: return "SparseResidency";
+    case CapabilityMinLod: return "MinLod";
+    case CapabilitySampled1D: return "Sampled1D";
+    case CapabilityImage1D: return "Image1D";
+    case CapabilitySampledCubeArray: return "SampledCubeArray";
+    case CapabilitySampledBuffer: return "SampledBuffer";
+    case CapabilityImageBuffer: return "ImageBuffer";
+    case CapabilityImageMSArray: return "ImageMSArray";
+    case CapabilityStorageImageExtendedFormats: return "StorageImageExtendedFormats";
+    case CapabilityImageQuery: return "ImageQuery";
+    case CapabilityDerivativeControl: return "DerivativeControl";
+    case CapabilityInterpolationFunction: return "InterpolationFunction";
+    case CapabilityTransformFeedback: return "TransformFeedback";
+    case CapabilityGeometryStreams: return "GeometryStreams";
+    case CapabilityStorageImageReadWithoutFormat: return "StorageImageReadWithoutFormat";
+    case CapabilityStorageImageWriteWithoutFormat: return "StorageImageWriteWithoutFormat";
+    case CapabilityMultiViewport: return "MultiViewport";
+    case CapabilitySubgroupDispatch: return "SubgroupDispatch";
+    case CapabilityNamedBarrier: return "NamedBarrier";
+    case CapabilityPipeStorage: return "PipeStorage";
+    case CapabilityGroupNonUniform: return "GroupNonUniform";
+    case CapabilityGroupNonUniformVote: return "GroupNonUniformVote";
+    case CapabilityGroupNonUniformArithmetic: return "GroupNonUniformArithmetic";
+    case CapabilityGroupNonUniformBallot: return "GroupNonUniformBallot";
+    case CapabilityGroupNonUniformShuffle: return "GroupNonUniformShuffle";
+    case CapabilityGroupNonUniformShuffleRelative: return "GroupNonUniformShuffleRelative";
+    case CapabilityGroupNonUniformClustered: return "GroupNonUniformClustered";
+    case CapabilityGroupNonUniformQuad: return "GroupNonUniformQuad";
+    case CapabilityShaderLayer: return "ShaderLayer";
+    case CapabilityShaderViewportIndex: return "ShaderViewportIndex";
+    case CapabilityUniformDecoration: return "UniformDecoration";
+    case CapabilityCoreBuiltinsARM: return "CoreBuiltinsARM";
+    case CapabilityTileImageColorReadAccessEXT: return "TileImageColorReadAccessEXT";
+    case CapabilityTileImageDepthReadAccessEXT: return "TileImageDepthReadAccessEXT";
+    case CapabilityTileImageStencilReadAccessEXT: return "TileImageStencilReadAccessEXT";
+    case CapabilityCooperativeMatrixLayoutsARM: return "CooperativeMatrixLayoutsARM";
+    case CapabilityFragmentShadingRateKHR: return "FragmentShadingRateKHR";
+    case CapabilitySubgroupBallotKHR: return "SubgroupBallotKHR";
+    case CapabilityDrawParameters: return "DrawParameters";
+    case CapabilityWorkgroupMemoryExplicitLayoutKHR: return "WorkgroupMemoryExplicitLayoutKHR";
+    case CapabilityWorkgroupMemoryExplicitLayout8BitAccessKHR: return "WorkgroupMemoryExplicitLayout8BitAccessKHR";
+    case CapabilityWorkgroupMemoryExplicitLayout16BitAccessKHR: return "WorkgroupMemoryExplicitLayout16BitAccessKHR";
+    case CapabilitySubgroupVoteKHR: return "SubgroupVoteKHR";
+    case CapabilityStorageBuffer16BitAccess: return "StorageBuffer16BitAccess";
+    case CapabilityStorageUniform16: return "StorageUniform16";
+    case CapabilityStoragePushConstant16: return "StoragePushConstant16";
+    case CapabilityStorageInputOutput16: return "StorageInputOutput16";
+    case CapabilityDeviceGroup: return "DeviceGroup";
+    case CapabilityMultiView: return "MultiView";
+    case CapabilityVariablePointersStorageBuffer: return "VariablePointersStorageBuffer";
+    case CapabilityVariablePointers: return "VariablePointers";
+    case CapabilityAtomicStorageOps: return "AtomicStorageOps";
+    case CapabilitySampleMaskPostDepthCoverage: return "SampleMaskPostDepthCoverage";
+    case CapabilityStorageBuffer8BitAccess: return "StorageBuffer8BitAccess";
+    case CapabilityUniformAndStorageBuffer8BitAccess: return "UniformAndStorageBuffer8BitAccess";
+    case CapabilityStoragePushConstant8: return "StoragePushConstant8";
+    case CapabilityDenormPreserve: return "DenormPreserve";
+    case CapabilityDenormFlushToZero: return "DenormFlushToZero";
+    case CapabilitySignedZeroInfNanPreserve: return "SignedZeroInfNanPreserve";
+    case CapabilityRoundingModeRTE: return "RoundingModeRTE";
+    case CapabilityRoundingModeRTZ: return "RoundingModeRTZ";
+    case CapabilityRayQueryProvisionalKHR: return "RayQueryProvisionalKHR";
+    case CapabilityRayQueryKHR: return "RayQueryKHR";
+    case CapabilityUntypedPointersKHR: return "UntypedPointersKHR";
+    case CapabilityRayTraversalPrimitiveCullingKHR: return "RayTraversalPrimitiveCullingKHR";
+    case CapabilityRayTracingKHR: return "RayTracingKHR";
+    case CapabilityTextureSampleWeightedQCOM: return "TextureSampleWeightedQCOM";
+    case CapabilityTextureBoxFilterQCOM: return "TextureBoxFilterQCOM";
+    case CapabilityTextureBlockMatchQCOM: return "TextureBlockMatchQCOM";
+    case CapabilityTextureBlockMatch2QCOM: return "TextureBlockMatch2QCOM";
+    case CapabilityFloat16ImageAMD: return "Float16ImageAMD";
+    case CapabilityImageGatherBiasLodAMD: return "ImageGatherBiasLodAMD";
+    case CapabilityFragmentMaskAMD: return "FragmentMaskAMD";
+    case CapabilityStencilExportEXT: return "StencilExportEXT";
+    case CapabilityImageReadWriteLodAMD: return "ImageReadWriteLodAMD";
+    case CapabilityInt64ImageEXT: return "Int64ImageEXT";
+    case CapabilityShaderClockKHR: return "ShaderClockKHR";
+    case CapabilityShaderEnqueueAMDX: return "ShaderEnqueueAMDX";
+    case CapabilityQuadControlKHR: return "QuadControlKHR";
+    case CapabilitySampleMaskOverrideCoverageNV: return "SampleMaskOverrideCoverageNV";
+    case CapabilityGeometryShaderPassthroughNV: return "GeometryShaderPassthroughNV";
+    case CapabilityShaderViewportIndexLayerEXT: return "ShaderViewportIndexLayerEXT";
+    case CapabilityShaderViewportMaskNV: return "ShaderViewportMaskNV";
+    case CapabilityShaderStereoViewNV: return "ShaderStereoViewNV";
+    case CapabilityPerViewAttributesNV: return "PerViewAttributesNV";
+    case CapabilityFragmentFullyCoveredEXT: return "FragmentFullyCoveredEXT";
+    case CapabilityMeshShadingNV: return "MeshShadingNV";
+    case CapabilityImageFootprintNV: return "ImageFootprintNV";
+    case CapabilityMeshShadingEXT: return "MeshShadingEXT";
+    case CapabilityFragmentBarycentricKHR: return "FragmentBarycentricKHR";
+    case CapabilityComputeDerivativeGroupQuadsKHR: return "ComputeDerivativeGroupQuadsKHR";
+    case CapabilityFragmentDensityEXT: return "FragmentDensityEXT";
+    case CapabilityGroupNonUniformPartitionedNV: return "GroupNonUniformPartitionedNV";
+    case CapabilityShaderNonUniform: return "ShaderNonUniform";
+    case CapabilityRuntimeDescriptorArray: return "RuntimeDescriptorArray";
+    case CapabilityInputAttachmentArrayDynamicIndexing: return "InputAttachmentArrayDynamicIndexing";
+    case CapabilityUniformTexelBufferArrayDynamicIndexing: return "UniformTexelBufferArrayDynamicIndexing";
+    case CapabilityStorageTexelBufferArrayDynamicIndexing: return "StorageTexelBufferArrayDynamicIndexing";
+    case CapabilityUniformBufferArrayNonUniformIndexing: return "UniformBufferArrayNonUniformIndexing";
+    case CapabilitySampledImageArrayNonUniformIndexing: return "SampledImageArrayNonUniformIndexing";
+    case CapabilityStorageBufferArrayNonUniformIndexing: return "StorageBufferArrayNonUniformIndexing";
+    case CapabilityStorageImageArrayNonUniformIndexing: return "StorageImageArrayNonUniformIndexing";
+    case CapabilityInputAttachmentArrayNonUniformIndexing: return "InputAttachmentArrayNonUniformIndexing";
+    case CapabilityUniformTexelBufferArrayNonUniformIndexing: return "UniformTexelBufferArrayNonUniformIndexing";
+    case CapabilityStorageTexelBufferArrayNonUniformIndexing: return "StorageTexelBufferArrayNonUniformIndexing";
+    case CapabilityRayTracingPositionFetchKHR: return "RayTracingPositionFetchKHR";
+    case CapabilityRayTracingNV: return "RayTracingNV";
+    case CapabilityRayTracingMotionBlurNV: return "RayTracingMotionBlurNV";
+    case CapabilityVulkanMemoryModel: return "VulkanMemoryModel";
+    case CapabilityVulkanMemoryModelDeviceScope: return "VulkanMemoryModelDeviceScope";
+    case CapabilityPhysicalStorageBufferAddresses: return "PhysicalStorageBufferAddresses";
+    case CapabilityComputeDerivativeGroupLinearKHR: return "ComputeDerivativeGroupLinearKHR";
+    case CapabilityRayTracingProvisionalKHR: return "RayTracingProvisionalKHR";
+    case CapabilityCooperativeMatrixNV: return "CooperativeMatrixNV";
+    case CapabilityFragmentShaderSampleInterlockEXT: return "FragmentShaderSampleInterlockEXT";
+    case CapabilityFragmentShaderShadingRateInterlockEXT: return "FragmentShaderShadingRateInterlockEXT";
+    case CapabilityShaderSMBuiltinsNV: return "ShaderSMBuiltinsNV";
+    case CapabilityFragmentShaderPixelInterlockEXT: return "FragmentShaderPixelInterlockEXT";
+    case CapabilityDemoteToHelperInvocation: return "DemoteToHelperInvocation";
+    case CapabilityDisplacementMicromapNV: return "DisplacementMicromapNV";
+    case CapabilityRayTracingOpacityMicromapEXT: return "RayTracingOpacityMicromapEXT";
+    case CapabilityShaderInvocationReorderNV: return "ShaderInvocationReorderNV";
+    case CapabilityBindlessTextureNV: return "BindlessTextureNV";
+    case CapabilityRayQueryPositionFetchKHR: return "RayQueryPositionFetchKHR";
+    case CapabilityAtomicFloat16VectorNV: return "AtomicFloat16VectorNV";
+    case CapabilityRayTracingDisplacementMicromapNV: return "RayTracingDisplacementMicromapNV";
+    case CapabilityRawAccessChainsNV: return "RawAccessChainsNV";
+    case CapabilitySubgroupShuffleINTEL: return "SubgroupShuffleINTEL";
+    case CapabilitySubgroupBufferBlockIOINTEL: return "SubgroupBufferBlockIOINTEL";
+    case CapabilitySubgroupImageBlockIOINTEL: return "SubgroupImageBlockIOINTEL";
+    case CapabilitySubgroupImageMediaBlockIOINTEL: return "SubgroupImageMediaBlockIOINTEL";
+    case CapabilityRoundToInfinityINTEL: return "RoundToInfinityINTEL";
+    case CapabilityFloatingPointModeINTEL: return "FloatingPointModeINTEL";
+    case CapabilityIntegerFunctions2INTEL: return "IntegerFunctions2INTEL";
+    case CapabilityFunctionPointersINTEL: return "FunctionPointersINTEL";
+    case CapabilityIndirectReferencesINTEL: return "IndirectReferencesINTEL";
+    case CapabilityAsmINTEL: return "AsmINTEL";
+    case CapabilityAtomicFloat32MinMaxEXT: return "AtomicFloat32MinMaxEXT";
+    case CapabilityAtomicFloat64MinMaxEXT: return "AtomicFloat64MinMaxEXT";
+    case CapabilityAtomicFloat16MinMaxEXT: return "AtomicFloat16MinMaxEXT";
+    case CapabilityVectorComputeINTEL: return "VectorComputeINTEL";
+    case CapabilityVectorAnyINTEL: return "VectorAnyINTEL";
+    case CapabilityExpectAssumeKHR: return "ExpectAssumeKHR";
+    case CapabilitySubgroupAvcMotionEstimationINTEL: return "SubgroupAvcMotionEstimationINTEL";
+    case CapabilitySubgroupAvcMotionEstimationIntraINTEL: return "SubgroupAvcMotionEstimationIntraINTEL";
+    case CapabilitySubgroupAvcMotionEstimationChromaINTEL: return "SubgroupAvcMotionEstimationChromaINTEL";
+    case CapabilityVariableLengthArrayINTEL: return "VariableLengthArrayINTEL";
+    case CapabilityFunctionFloatControlINTEL: return "FunctionFloatControlINTEL";
+    case CapabilityFPGAMemoryAttributesINTEL: return "FPGAMemoryAttributesINTEL";
+    case CapabilityFPFastMathModeINTEL: return "FPFastMathModeINTEL";
+    case CapabilityArbitraryPrecisionIntegersINTEL: return "ArbitraryPrecisionIntegersINTEL";
+    case CapabilityArbitraryPrecisionFloatingPointINTEL: return "ArbitraryPrecisionFloatingPointINTEL";
+    case CapabilityUnstructuredLoopControlsINTEL: return "UnstructuredLoopControlsINTEL";
+    case CapabilityFPGALoopControlsINTEL: return "FPGALoopControlsINTEL";
+    case CapabilityKernelAttributesINTEL: return "KernelAttributesINTEL";
+    case CapabilityFPGAKernelAttributesINTEL: return "FPGAKernelAttributesINTEL";
+    case CapabilityFPGAMemoryAccessesINTEL: return "FPGAMemoryAccessesINTEL";
+    case CapabilityFPGAClusterAttributesINTEL: return "FPGAClusterAttributesINTEL";
+    case CapabilityLoopFuseINTEL: return "LoopFuseINTEL";
+    case CapabilityFPGADSPControlINTEL: return "FPGADSPControlINTEL";
+    case CapabilityMemoryAccessAliasingINTEL: return "MemoryAccessAliasingINTEL";
+    case CapabilityFPGAInvocationPipeliningAttributesINTEL: return "FPGAInvocationPipeliningAttributesINTEL";
+    case CapabilityFPGABufferLocationINTEL: return "FPGABufferLocationINTEL";
+    case CapabilityArbitraryPrecisionFixedPointINTEL: return "ArbitraryPrecisionFixedPointINTEL";
+    case CapabilityUSMStorageClassesINTEL: return "USMStorageClassesINTEL";
+    case CapabilityRuntimeAlignedAttributeINTEL: return "RuntimeAlignedAttributeINTEL";
+    case CapabilityIOPipesINTEL: return "IOPipesINTEL";
+    case CapabilityBlockingPipesINTEL: return "BlockingPipesINTEL";
+    case CapabilityFPGARegINTEL: return "FPGARegINTEL";
+    case CapabilityDotProductInputAll: return "DotProductInputAll";
+    case CapabilityDotProductInput4x8Bit: return "DotProductInput4x8Bit";
+    case CapabilityDotProductInput4x8BitPacked: return "DotProductInput4x8BitPacked";
+    case CapabilityDotProduct: return "DotProduct";
+    case CapabilityRayCullMaskKHR: return "RayCullMaskKHR";
+    case CapabilityCooperativeMatrixKHR: return "CooperativeMatrixKHR";
+    case CapabilityReplicatedCompositesEXT: return "ReplicatedCompositesEXT";
+    case CapabilityBitInstructions: return "BitInstructions";
+    case CapabilityGroupNonUniformRotateKHR: return "GroupNonUniformRotateKHR";
+    case CapabilityFloatControls2: return "FloatControls2";
+    case CapabilityAtomicFloat32AddEXT: return "AtomicFloat32AddEXT";
+    case CapabilityAtomicFloat64AddEXT: return "AtomicFloat64AddEXT";
+    case CapabilityLongCompositesINTEL: return "LongCompositesINTEL";
+    case CapabilityOptNoneINTEL: return "OptNoneINTEL";
+    case CapabilityAtomicFloat16AddEXT: return "AtomicFloat16AddEXT";
+    case CapabilityDebugInfoModuleINTEL: return "DebugInfoModuleINTEL";
+    case CapabilityBFloat16ConversionINTEL: return "BFloat16ConversionINTEL";
+    case CapabilitySplitBarrierINTEL: return "SplitBarrierINTEL";
+    case CapabilityFPGAClusterAttributesV2INTEL: return "FPGAClusterAttributesV2INTEL";
+    case CapabilityFPGAKernelAttributesv2INTEL: return "FPGAKernelAttributesv2INTEL";
+    case CapabilityFPMaxErrorINTEL: return "FPMaxErrorINTEL";
+    case CapabilityFPGALatencyControlINTEL: return "FPGALatencyControlINTEL";
+    case CapabilityFPGAArgumentInterfacesINTEL: return "FPGAArgumentInterfacesINTEL";
+    case CapabilityGlobalVariableHostAccessINTEL: return "GlobalVariableHostAccessINTEL";
+    case CapabilityGlobalVariableFPGADecorationsINTEL: return "GlobalVariableFPGADecorationsINTEL";
+    case CapabilitySubgroupBufferPrefetchINTEL: return "SubgroupBufferPrefetchINTEL";
+    case CapabilityGroupUniformArithmeticKHR: return "GroupUniformArithmeticKHR";
+    case CapabilityMaskedGatherScatterINTEL: return "MaskedGatherScatterINTEL";
+    case CapabilityCacheControlsINTEL: return "CacheControlsINTEL";
+    case CapabilityRegisterLimitsINTEL: return "RegisterLimitsINTEL";
+    default: return "Unknown"; // Fallback for any value not matched by a case above.
+    }
+}
+
+inline const char* RayQueryIntersectionToString(RayQueryIntersection value) { // Returns the enumerator's name as a string literal, with the leading "RayQueryIntersection" type prefix dropped.
+    switch (value) {
+    case RayQueryIntersectionRayQueryCandidateIntersectionKHR: return "RayQueryCandidateIntersectionKHR";
+    case RayQueryIntersectionRayQueryCommittedIntersectionKHR: return "RayQueryCommittedIntersectionKHR";
+    default: return "Unknown"; // Fallback for any value not matched by a case above.
+    }
+}
+
+inline const char* RayQueryCommittedIntersectionTypeToString(RayQueryCommittedIntersectionType value) { // Returns the enumerator's name as a string literal, with the leading "RayQueryCommittedIntersectionType" prefix dropped.
+    switch (value) {
+    case RayQueryCommittedIntersectionTypeRayQueryCommittedIntersectionNoneKHR: return "RayQueryCommittedIntersectionNoneKHR";
+    case RayQueryCommittedIntersectionTypeRayQueryCommittedIntersectionTriangleKHR: return "RayQueryCommittedIntersectionTriangleKHR";
+    case RayQueryCommittedIntersectionTypeRayQueryCommittedIntersectionGeneratedKHR: return "RayQueryCommittedIntersectionGeneratedKHR";
+    default: return "Unknown"; // Fallback for any value not matched by a case above.
+    }
+}
+
+inline const char* RayQueryCandidateIntersectionTypeToString(RayQueryCandidateIntersectionType value) { // Returns the enumerator's name as a string literal, with the leading "RayQueryCandidateIntersectionType" prefix dropped.
+    switch (value) {
+    case RayQueryCandidateIntersectionTypeRayQueryCandidateIntersectionTriangleKHR: return "RayQueryCandidateIntersectionTriangleKHR";
+    case RayQueryCandidateIntersectionTypeRayQueryCandidateIntersectionAABBKHR: return "RayQueryCandidateIntersectionAABBKHR";
+    default: return "Unknown"; // Fallback for any value not matched by a case above.
+    }
+}
+
+inline const char* FPDenormModeToString(FPDenormMode value) { // Returns the enumerator's name as a string literal, without the "FPDenormMode" prefix.
+    switch (value) {
+    case FPDenormModePreserve: return "Preserve";
+    case FPDenormModeFlushToZero: return "FlushToZero";
+    default: return "Unknown"; // Fallback for any value not matched by a case above.
+    }
+}
+
+inline const char* FPOperationModeToString(FPOperationMode value) {
+    // Name lookup as a conditional chain; values other than the two listed
+    // enumerants produce "Unknown".
+    return value == FPOperationModeIEEE ? "IEEE"
+         : value == FPOperationModeALT ? "ALT"
+         : "Unknown";
+}
+
+inline const char* QuantizationModesToString(QuantizationModes value) {
+    // One guard per enumerant; the returned text is the enumerant name minus "QuantizationModes".
+    if (value == QuantizationModesTRN) return "TRN";
+    if (value == QuantizationModesTRN_ZERO) return "TRN_ZERO";
+    if (value == QuantizationModesRND) return "RND";
+    if (value == QuantizationModesRND_ZERO) return "RND_ZERO";
+    if (value == QuantizationModesRND_INF) return "RND_INF";
+    if (value == QuantizationModesRND_MIN_INF) return "RND_MIN_INF";
+    if (value == QuantizationModesRND_CONV) return "RND_CONV";
+    if (value == QuantizationModesRND_CONV_ODD) return "RND_CONV_ODD";
+    // Fell through every guard: the value is not one of the listed modes.
+    return "Unknown";
+}
+
+inline const char* OverflowModesToString(OverflowModes value) {
+    // "Unknown" is the result unless one of the four listed enumerants matches.
+    const char* name = "Unknown";
+    if (value == OverflowModesWRAP) name = "WRAP";
+    else if (value == OverflowModesSAT) name = "SAT";
+    else if (value == OverflowModesSAT_ZERO) name = "SAT_ZERO";
+    else if (value == OverflowModesSAT_SYM) name = "SAT_SYM";
+    return name;
+}
+
+inline const char* PackedVectorFormatToString(PackedVectorFormat value) {
+    // Only one enumerant is recognized here; every other value yields "Unknown".
+    if (value == PackedVectorFormatPackedVectorFormat4x8Bit) return "PackedVectorFormat4x8Bit";
+    // Not the single known format.
+    return "Unknown";
+}
+
+inline const char* CooperativeMatrixLayoutToString(CooperativeMatrixLayout value) {
+    // Conditional chain over the four listed layouts; the final arm is the
+    // "Unknown" fallback for any other value.
+    return value == CooperativeMatrixLayoutRowMajorKHR ? "RowMajorKHR"
+         : value == CooperativeMatrixLayoutColumnMajorKHR ? "ColumnMajorKHR"
+         : value == CooperativeMatrixLayoutRowBlockedInterleavedARM ? "RowBlockedInterleavedARM"
+         : value == CooperativeMatrixLayoutColumnBlockedInterleavedARM ? "ColumnBlockedInterleavedARM"
+         : "Unknown";
+}
+
+inline const char* CooperativeMatrixUseToString(CooperativeMatrixUse value) {
+    // Return the enumerant's name with the "CooperativeMatrixUse" prefix removed.
+    if (value == CooperativeMatrixUseMatrixAKHR) return "MatrixAKHR";
+    if (value == CooperativeMatrixUseMatrixBKHR) return "MatrixBKHR";
+    if (value == CooperativeMatrixUseMatrixAccumulatorKHR) return "MatrixAccumulatorKHR";
+    // None of the three listed uses matched.
+    return "Unknown";
+}
+
+inline const char* InitializationModeQualifierToString(InitializationModeQualifier value) {
+    // Default to "Unknown"; a matching enumerant overrides it with its unprefixed name.
+    const char* name = "Unknown";
+    if (value == InitializationModeQualifierInitOnDeviceReprogramINTEL) name = "InitOnDeviceReprogramINTEL";
+    else if (value == InitializationModeQualifierInitOnDeviceResetINTEL) name = "InitOnDeviceResetINTEL";
+    return name;
+}
+
+inline const char* HostAccessQualifierToString(HostAccessQualifier value) {
+    // Guard per enumerant; each returns the name without the "HostAccessQualifier" prefix.
+    if (value == HostAccessQualifierNoneINTEL) return "NoneINTEL";
+    if (value == HostAccessQualifierReadINTEL) return "ReadINTEL";
+    if (value == HostAccessQualifierWriteINTEL) return "WriteINTEL";
+    if (value == HostAccessQualifierReadWriteINTEL) return "ReadWriteINTEL";
+    // Value is not one of the four listed qualifiers.
+    return "Unknown";
+}
+
+inline const char* LoadCacheControlToString(LoadCacheControl value) {
+    // Conditional chain over the five listed load cache controls; the last
+    // arm returns "Unknown" for every other value.
+    return value == LoadCacheControlUncachedINTEL ? "UncachedINTEL"
+         : value == LoadCacheControlCachedINTEL ? "CachedINTEL"
+         : value == LoadCacheControlStreamingINTEL ? "StreamingINTEL"
+         : value == LoadCacheControlInvalidateAfterReadINTEL ? "InvalidateAfterReadINTEL"
+         : value == LoadCacheControlConstCachedINTEL ? "ConstCachedINTEL"
+         : "Unknown";
+}
+
+inline const char* StoreCacheControlToString(StoreCacheControl value) {
+    // Begin with the fallback and swap in the unprefixed name of a matching enumerant.
+    const char* name = "Unknown";
+    if (value == StoreCacheControlUncachedINTEL) name = "UncachedINTEL";
+    else if (value == StoreCacheControlWriteThroughINTEL) name = "WriteThroughINTEL";
+    else if (value == StoreCacheControlWriteBackINTEL) name = "WriteBackINTEL";
+    else if (value == StoreCacheControlStreamingINTEL) name = "StreamingINTEL";
+    return name;
+}
+
+inline const char* NamedMaximumNumberOfRegistersToString(NamedMaximumNumberOfRegisters value) {
+    // A single enumerant is recognized; all other values produce "Unknown".
+    if (value == NamedMaximumNumberOfRegistersAutoINTEL) return "AutoINTEL";
+    // Not the one listed enumerant.
+    return "Unknown";
+}
+
+inline const char* FPEncodingToString(FPEncoding value) {
+    // No FPEncoding enumerant is matched here, so every value maps to "Unknown"; returning directly avoids a case-less switch (MSVC C4065).
+    (void)value; // parameter kept for the interface; silences unused-parameter warnings
+    return "Unknown";
+}
+
+inline const char* OpToString(Op value) {
+    switch (value) {
+    case OpNop: return "OpNop";
+    case OpUndef: return "OpUndef";
+    case OpSourceContinued: return "OpSourceContinued";
+    case OpSource: return "OpSource";
+    case OpSourceExtension: return "OpSourceExtension";
+    case OpName: return "OpName";
+    case OpMemberName: return "OpMemberName";
+    case OpString: return "OpString";
+    case OpLine: return "OpLine";
+    case OpExtension: return "OpExtension";
+    case OpExtInstImport: return "OpExtInstImport";
+    case OpExtInst: return "OpExtInst";
+    case OpMemoryModel: return "OpMemoryModel";
+    case OpEntryPoint: return "OpEntryPoint";
+    case OpExecutionMode: return "OpExecutionMode";
+    case OpCapability: return "OpCapability";
+    case OpTypeVoid: return "OpTypeVoid";
+    case OpTypeBool: return "OpTypeBool";
+    case OpTypeInt: return "OpTypeInt";
+    case OpTypeFloat: return "OpTypeFloat";
+    case OpTypeVector: return "OpTypeVector";
+    case OpTypeMatrix: return "OpTypeMatrix";
+    case OpTypeImage: return "OpTypeImage";
+    case OpTypeSampler: return "OpTypeSampler";
+    case OpTypeSampledImage: return "OpTypeSampledImage";
+    case OpTypeArray: return "OpTypeArray";
+    case OpTypeRuntimeArray: return "OpTypeRuntimeArray";
+    case OpTypeStruct: return "OpTypeStruct";
+    case OpTypeOpaque: return "OpTypeOpaque";
+    case OpTypePointer: return "OpTypePointer";
+    case OpTypeFunction: return "OpTypeFunction";
+    case OpTypeEvent: return "OpTypeEvent";
+    case OpTypeDeviceEvent: return "OpTypeDeviceEvent";
+    case OpTypeReserveId: return "OpTypeReserveId";
+    case OpTypeQueue: return "OpTypeQueue";
+    case OpTypePipe: return "OpTypePipe";
+    case OpTypeForwardPointer: return "OpTypeForwardPointer";
+    case OpConstantTrue: return "OpConstantTrue";
+    case OpConstantFalse: return "OpConstantFalse";
+    case OpConstant: return "OpConstant";
+    case OpConstantComposite: return "OpConstantComposite";
+    case OpConstantSampler: return "OpConstantSampler";
+    case OpConstantNull: return "OpConstantNull";
+    case OpSpecConstantTrue: return "OpSpecConstantTrue";
+    case OpSpecConstantFalse: return "OpSpecConstantFalse";
+    case OpSpecConstant: return "OpSpecConstant";
+    case OpSpecConstantComposite: return "OpSpecConstantComposite";
+    case OpSpecConstantOp: return "OpSpecConstantOp";
+    case OpFunction: return "OpFunction";
+    case OpFunctionParameter: return "OpFunctionParameter";
+    case OpFunctionEnd: return "OpFunctionEnd";
+    case OpFunctionCall: return "OpFunctionCall";
+    case OpVariable: return "OpVariable";
+    case OpImageTexelPointer: return "OpImageTexelPointer";
+    case OpLoad: return "OpLoad";
+    case OpStore: return "OpStore";
+    case OpCopyMemory: return "OpCopyMemory";
+    case OpCopyMemorySized: return "OpCopyMemorySized";
+    case OpAccessChain: return "OpAccessChain";
+    case OpInBoundsAccessChain: return "OpInBoundsAccessChain";
+    case OpPtrAccessChain: return "OpPtrAccessChain";
+    case OpArrayLength: return "OpArrayLength";
+    case OpGenericPtrMemSemantics: return "OpGenericPtrMemSemantics";
+    case OpInBoundsPtrAccessChain: return "OpInBoundsPtrAccessChain";
+    case OpDecorate: return "OpDecorate";
+    case OpMemberDecorate: return "OpMemberDecorate";
+    case OpDecorationGroup: return "OpDecorationGroup";
+    case OpGroupDecorate: return "OpGroupDecorate";
+    case OpGroupMemberDecorate: return "OpGroupMemberDecorate";
+    case OpVectorExtractDynamic: return "OpVectorExtractDynamic";
+    case OpVectorInsertDynamic: return "OpVectorInsertDynamic";
+    case OpVectorShuffle: return "OpVectorShuffle";
+    case OpCompositeConstruct: return "OpCompositeConstruct";
+    case OpCompositeExtract: return "OpCompositeExtract";
+    case OpCompositeInsert: return "OpCompositeInsert";
+    case OpCopyObject: return "OpCopyObject";
+    case OpTranspose: return "OpTranspose";
+    case OpSampledImage: return "OpSampledImage";
+    case OpImageSampleImplicitLod: return "OpImageSampleImplicitLod";
+    case OpImageSampleExplicitLod: return "OpImageSampleExplicitLod";
+    case OpImageSampleDrefImplicitLod: return "OpImageSampleDrefImplicitLod";
+    case OpImageSampleDrefExplicitLod: return "OpImageSampleDrefExplicitLod";
+    case OpImageSampleProjImplicitLod: return "OpImageSampleProjImplicitLod";
+    case OpImageSampleProjExplicitLod: return "OpImageSampleProjExplicitLod";
+    case OpImageSampleProjDrefImplicitLod: return "OpImageSampleProjDrefImplicitLod";
+    case OpImageSampleProjDrefExplicitLod: return "OpImageSampleProjDrefExplicitLod";
+    case OpImageFetch: return "OpImageFetch";
+    case OpImageGather: return "OpImageGather";
+    case OpImageDrefGather: return "OpImageDrefGather";
+    case OpImageRead: return "OpImageRead";
+    case OpImageWrite: return "OpImageWrite";
+    case OpImage: return "OpImage";
+    case OpImageQueryFormat: return "OpImageQueryFormat";
+    case OpImageQueryOrder: return "OpImageQueryOrder";
+    case OpImageQuerySizeLod: return "OpImageQuerySizeLod";
+    case OpImageQuerySize: return "OpImageQuerySize";
+    case OpImageQueryLod: return "OpImageQueryLod";
+    case OpImageQueryLevels: return "OpImageQueryLevels";
+    case OpImageQuerySamples: return "OpImageQuerySamples";
+    case OpConvertFToU: return "OpConvertFToU";
+    case OpConvertFToS: return "OpConvertFToS";
+    case OpConvertSToF: return "OpConvertSToF";
+    case OpConvertUToF: return "OpConvertUToF";
+    case OpUConvert: return "OpUConvert";
+    case OpSConvert: return "OpSConvert";
+    case OpFConvert: return "OpFConvert";
+    case OpQuantizeToF16: return "OpQuantizeToF16";
+    case OpConvertPtrToU: return "OpConvertPtrToU";
+    case OpSatConvertSToU: return "OpSatConvertSToU";
+    case OpSatConvertUToS: return "OpSatConvertUToS";
+    case OpConvertUToPtr: return "OpConvertUToPtr";
+    case OpPtrCastToGeneric: return "OpPtrCastToGeneric";
+    case OpGenericCastToPtr: return "OpGenericCastToPtr";
+    case OpGenericCastToPtrExplicit: return "OpGenericCastToPtrExplicit";
+    case OpBitcast: return "OpBitcast";
+    case OpSNegate: return "OpSNegate";
+    case OpFNegate: return "OpFNegate";
+    case OpIAdd: return "OpIAdd";
+    case OpFAdd: return "OpFAdd";
+    case OpISub: return "OpISub";
+    case OpFSub: return "OpFSub";
+    case OpIMul: return "OpIMul";
+    case OpFMul: return "OpFMul";
+    case OpUDiv: return "OpUDiv";
+    case OpSDiv: return "OpSDiv";
+    case OpFDiv: return "OpFDiv";
+    case OpUMod: return "OpUMod";
+    case OpSRem: return "OpSRem";
+    case OpSMod: return "OpSMod";
+    case OpFRem: return "OpFRem";
+    case OpFMod: return "OpFMod";
+    case OpVectorTimesScalar: return "OpVectorTimesScalar";
+    case OpMatrixTimesScalar: return "OpMatrixTimesScalar";
+    case OpVectorTimesMatrix: return "OpVectorTimesMatrix";
+    case OpMatrixTimesVector: return "OpMatrixTimesVector";
+    case OpMatrixTimesMatrix: return "OpMatrixTimesMatrix";
+    case OpOuterProduct: return "OpOuterProduct";
+    case OpDot: return "OpDot";
+    case OpIAddCarry: return "OpIAddCarry";
+    case OpISubBorrow: return "OpISubBorrow";
+    case OpUMulExtended: return "OpUMulExtended";
+    case OpSMulExtended: return "OpSMulExtended";
+    case OpAny: return "OpAny";
+    case OpAll: return "OpAll";
+    case OpIsNan: return "OpIsNan";
+    case OpIsInf: return "OpIsInf";
+    case OpIsFinite: return "OpIsFinite";
+    case OpIsNormal: return "OpIsNormal";
+    case OpSignBitSet: return "OpSignBitSet";
+    case OpLessOrGreater: return "OpLessOrGreater";
+    case OpOrdered: return "OpOrdered";
+    case OpUnordered: return "OpUnordered";
+    case OpLogicalEqual: return "OpLogicalEqual";
+    case OpLogicalNotEqual: return "OpLogicalNotEqual";
+    case OpLogicalOr: return "OpLogicalOr";
+    case OpLogicalAnd: return "OpLogicalAnd";
+    case OpLogicalNot: return "OpLogicalNot";
+    case OpSelect: return "OpSelect";
+    case OpIEqual: return "OpIEqual";
+    case OpINotEqual: return "OpINotEqual";
+    case OpUGreaterThan: return "OpUGreaterThan";
+    case OpSGreaterThan: return "OpSGreaterThan";
+    case OpUGreaterThanEqual: return "OpUGreaterThanEqual";
+    case OpSGreaterThanEqual: return "OpSGreaterThanEqual";
+    case OpULessThan: return "OpULessThan";
+    case OpSLessThan: return "OpSLessThan";
+    case OpULessThanEqual: return "OpULessThanEqual";
+    case OpSLessThanEqual: return "OpSLessThanEqual";
+    case OpFOrdEqual: return "OpFOrdEqual";
+    case OpFUnordEqual: return "OpFUnordEqual";
+    case OpFOrdNotEqual: return "OpFOrdNotEqual";
+    case OpFUnordNotEqual: return "OpFUnordNotEqual";
+    case OpFOrdLessThan: return "OpFOrdLessThan";
+    case OpFUnordLessThan: return "OpFUnordLessThan";
+    case OpFOrdGreaterThan: return "OpFOrdGreaterThan";
+    case OpFUnordGreaterThan: return "OpFUnordGreaterThan";
+    case OpFOrdLessThanEqual: return "OpFOrdLessThanEqual";
+    case OpFUnordLessThanEqual: return "OpFUnordLessThanEqual";
+    case OpFOrdGreaterThanEqual: return "OpFOrdGreaterThanEqual";
+    case OpFUnordGreaterThanEqual: return "OpFUnordGreaterThanEqual";
+    case OpShiftRightLogical: return "OpShiftRightLogical";
+    case OpShiftRightArithmetic: return "OpShiftRightArithmetic";
+    case OpShiftLeftLogical: return "OpShiftLeftLogical";
+    case OpBitwiseOr: return "OpBitwiseOr";
+    case OpBitwiseXor: return "OpBitwiseXor";
+    case OpBitwiseAnd: return "OpBitwiseAnd";
+    case OpNot: return "OpNot";
+    case OpBitFieldInsert: return "OpBitFieldInsert";
+    case OpBitFieldSExtract: return "OpBitFieldSExtract";
+    case OpBitFieldUExtract: return "OpBitFieldUExtract";
+    case OpBitReverse: return "OpBitReverse";
+    case OpBitCount: return "OpBitCount";
+    case OpDPdx: return "OpDPdx";
+    case OpDPdy: return "OpDPdy";
+    case OpFwidth: return "OpFwidth";
+    case OpDPdxFine: return "OpDPdxFine";
+    case OpDPdyFine: return "OpDPdyFine";
+    case OpFwidthFine: return "OpFwidthFine";
+    case OpDPdxCoarse: return "OpDPdxCoarse";
+    case OpDPdyCoarse: return "OpDPdyCoarse";
+    case OpFwidthCoarse: return "OpFwidthCoarse";
+    case OpEmitVertex: return "OpEmitVertex";
+    case OpEndPrimitive: return "OpEndPrimitive";
+    case OpEmitStreamVertex: return "OpEmitStreamVertex";
+    case OpEndStreamPrimitive: return "OpEndStreamPrimitive";
+    case OpControlBarrier: return "OpControlBarrier";
+    case OpMemoryBarrier: return "OpMemoryBarrier";
+    case OpAtomicLoad: return "OpAtomicLoad";
+    case OpAtomicStore: return "OpAtomicStore";
+    case OpAtomicExchange: return "OpAtomicExchange";
+    case OpAtomicCompareExchange: return "OpAtomicCompareExchange";
+    case OpAtomicCompareExchangeWeak: return "OpAtomicCompareExchangeWeak";
+    case OpAtomicIIncrement: return "OpAtomicIIncrement";
+    case OpAtomicIDecrement: return "OpAtomicIDecrement";
+    case OpAtomicIAdd: return "OpAtomicIAdd";
+    case OpAtomicISub: return "OpAtomicISub";
+    case OpAtomicSMin: return "OpAtomicSMin";
+    case OpAtomicUMin: return "OpAtomicUMin";
+    case OpAtomicSMax: return "OpAtomicSMax";
+    case OpAtomicUMax: return "OpAtomicUMax";
+    case OpAtomicAnd: return "OpAtomicAnd";
+    case OpAtomicOr: return "OpAtomicOr";
+    case OpAtomicXor: return "OpAtomicXor";
+    case OpPhi: return "OpPhi";
+    case OpLoopMerge: return "OpLoopMerge";
+    case OpSelectionMerge: return "OpSelectionMerge";
+    case OpLabel: return "OpLabel";
+    case OpBranch: return "OpBranch";
+    case OpBranchConditional: return "OpBranchConditional";
+    case OpSwitch: return "OpSwitch";
+    case OpKill: return "OpKill";
+    case OpReturn: return "OpReturn";
+    case OpReturnValue: return "OpReturnValue";
+    case OpUnreachable: return "OpUnreachable";
+    case OpLifetimeStart: return "OpLifetimeStart";
+    case OpLifetimeStop: return "OpLifetimeStop";
+    case OpGroupAsyncCopy: return "OpGroupAsyncCopy";
+    case OpGroupWaitEvents: return "OpGroupWaitEvents";
+    case OpGroupAll: return "OpGroupAll";
+    case OpGroupAny: return "OpGroupAny";
+    case OpGroupBroadcast: return "OpGroupBroadcast";
+    case OpGroupIAdd: return "OpGroupIAdd";
+    case OpGroupFAdd: return "OpGroupFAdd";
+    case OpGroupFMin: return "OpGroupFMin";
+    case OpGroupUMin: return "OpGroupUMin";
+    case OpGroupSMin: return "OpGroupSMin";
+    case OpGroupFMax: return "OpGroupFMax";
+    case OpGroupUMax: return "OpGroupUMax";
+    case OpGroupSMax: return "OpGroupSMax";
+    case OpReadPipe: return "OpReadPipe";
+    case OpWritePipe: return "OpWritePipe";
+    case OpReservedReadPipe: return "OpReservedReadPipe";
+    case OpReservedWritePipe: return "OpReservedWritePipe";
+    case OpReserveReadPipePackets: return "OpReserveReadPipePackets";
+    case OpReserveWritePipePackets: return "OpReserveWritePipePackets";
+    case OpCommitReadPipe: return "OpCommitReadPipe";
+    case OpCommitWritePipe: return "OpCommitWritePipe";
+    case OpIsValidReserveId: return "OpIsValidReserveId";
+    case OpGetNumPipePackets: return "OpGetNumPipePackets";
+    case OpGetMaxPipePackets: return "OpGetMaxPipePackets";
+    case OpGroupReserveReadPipePackets: return "OpGroupReserveReadPipePackets";
+    case OpGroupReserveWritePipePackets: return "OpGroupReserveWritePipePackets";
+    case OpGroupCommitReadPipe: return "OpGroupCommitReadPipe";
+    case OpGroupCommitWritePipe: return "OpGroupCommitWritePipe";
+    case OpEnqueueMarker: return "OpEnqueueMarker";
+    case OpEnqueueKernel: return "OpEnqueueKernel";
+    case OpGetKernelNDrangeSubGroupCount: return "OpGetKernelNDrangeSubGroupCount";
+    case OpGetKernelNDrangeMaxSubGroupSize: return "OpGetKernelNDrangeMaxSubGroupSize";
+    case OpGetKernelWorkGroupSize: return "OpGetKernelWorkGroupSize";
+    case OpGetKernelPreferredWorkGroupSizeMultiple: return "OpGetKernelPreferredWorkGroupSizeMultiple";
+    case OpRetainEvent: return "OpRetainEvent";
+    case OpReleaseEvent: return "OpReleaseEvent";
+    case OpCreateUserEvent: return "OpCreateUserEvent";
+    case OpIsValidEvent: return "OpIsValidEvent";
+    case OpSetUserEventStatus: return "OpSetUserEventStatus";
+    case OpCaptureEventProfilingInfo: return "OpCaptureEventProfilingInfo";
+    case OpGetDefaultQueue: return "OpGetDefaultQueue";
+    case OpBuildNDRange: return "OpBuildNDRange";
+    case OpImageSparseSampleImplicitLod: return "OpImageSparseSampleImplicitLod";
+    case OpImageSparseSampleExplicitLod: return "OpImageSparseSampleExplicitLod";
+    case OpImageSparseSampleDrefImplicitLod: return "OpImageSparseSampleDrefImplicitLod";
+    case OpImageSparseSampleDrefExplicitLod: return "OpImageSparseSampleDrefExplicitLod";
+    case OpImageSparseSampleProjImplicitLod: return "OpImageSparseSampleProjImplicitLod";
+    case OpImageSparseSampleProjExplicitLod: return "OpImageSparseSampleProjExplicitLod";
+    case OpImageSparseSampleProjDrefImplicitLod: return "OpImageSparseSampleProjDrefImplicitLod";
+    case OpImageSparseSampleProjDrefExplicitLod: return "OpImageSparseSampleProjDrefExplicitLod";
+    case OpImageSparseFetch: return "OpImageSparseFetch";
+    case OpImageSparseGather: return "OpImageSparseGather";
+    case OpImageSparseDrefGather: return "OpImageSparseDrefGather";
+    case OpImageSparseTexelsResident: return "OpImageSparseTexelsResident";
+    case OpNoLine: return "OpNoLine";
+    case OpAtomicFlagTestAndSet: return "OpAtomicFlagTestAndSet";
+    case OpAtomicFlagClear: return "OpAtomicFlagClear";
+    case OpImageSparseRead: return "OpImageSparseRead";
+    case OpSizeOf: return "OpSizeOf";
+    case OpTypePipeStorage: return "OpTypePipeStorage";
+    case OpConstantPipeStorage: return "OpConstantPipeStorage";
+    case OpCreatePipeFromPipeStorage: return "OpCreatePipeFromPipeStorage";
+    case OpGetKernelLocalSizeForSubgroupCount: return "OpGetKernelLocalSizeForSubgroupCount";
+    case OpGetKernelMaxNumSubgroups: return "OpGetKernelMaxNumSubgroups";
+    case OpTypeNamedBarrier: return "OpTypeNamedBarrier";
+    case OpNamedBarrierInitialize: return "OpNamedBarrierInitialize";
+    case OpMemoryNamedBarrier: return "OpMemoryNamedBarrier";
+    case OpModuleProcessed: return "OpModuleProcessed";
+    case OpExecutionModeId: return "OpExecutionModeId";
+    case OpDecorateId: return "OpDecorateId";
+    case OpGroupNonUniformElect: return "OpGroupNonUniformElect";
+    case OpGroupNonUniformAll: return "OpGroupNonUniformAll";
+    case OpGroupNonUniformAny: return "OpGroupNonUniformAny";
+    case OpGroupNonUniformAllEqual: return "OpGroupNonUniformAllEqual";
+    case OpGroupNonUniformBroadcast: return "OpGroupNonUniformBroadcast";
+    case OpGroupNonUniformBroadcastFirst: return "OpGroupNonUniformBroadcastFirst";
+    case OpGroupNonUniformBallot: return "OpGroupNonUniformBallot";
+    case OpGroupNonUniformInverseBallot: return "OpGroupNonUniformInverseBallot";
+    case OpGroupNonUniformBallotBitExtract: return "OpGroupNonUniformBallotBitExtract";
+    case OpGroupNonUniformBallotBitCount: return "OpGroupNonUniformBallotBitCount";
+    case OpGroupNonUniformBallotFindLSB: return "OpGroupNonUniformBallotFindLSB";
+    case OpGroupNonUniformBallotFindMSB: return "OpGroupNonUniformBallotFindMSB";
+    case OpGroupNonUniformShuffle: return "OpGroupNonUniformShuffle";
+    case OpGroupNonUniformShuffleXor: return "OpGroupNonUniformShuffleXor";
+    case OpGroupNonUniformShuffleUp: return "OpGroupNonUniformShuffleUp";
+    case OpGroupNonUniformShuffleDown: return "OpGroupNonUniformShuffleDown";
+    case OpGroupNonUniformIAdd: return "OpGroupNonUniformIAdd";
+    case OpGroupNonUniformFAdd: return "OpGroupNonUniformFAdd";
+    case OpGroupNonUniformIMul: return "OpGroupNonUniformIMul";
+    case OpGroupNonUniformFMul: return "OpGroupNonUniformFMul";
+    case OpGroupNonUniformSMin: return "OpGroupNonUniformSMin";
+    case OpGroupNonUniformUMin: return "OpGroupNonUniformUMin";
+    case OpGroupNonUniformFMin: return "OpGroupNonUniformFMin";
+    case OpGroupNonUniformSMax: return "OpGroupNonUniformSMax";
+    case OpGroupNonUniformUMax: return "OpGroupNonUniformUMax";
+    case OpGroupNonUniformFMax: return "OpGroupNonUniformFMax";
+    case OpGroupNonUniformBitwiseAnd: return "OpGroupNonUniformBitwiseAnd";
+    case OpGroupNonUniformBitwiseOr: return "OpGroupNonUniformBitwiseOr";
+    case OpGroupNonUniformBitwiseXor: return "OpGroupNonUniformBitwiseXor";
+    case OpGroupNonUniformLogicalAnd: return "OpGroupNonUniformLogicalAnd";
+    case OpGroupNonUniformLogicalOr: return "OpGroupNonUniformLogicalOr";
+    case OpGroupNonUniformLogicalXor: return "OpGroupNonUniformLogicalXor";
+    case OpGroupNonUniformQuadBroadcast: return "OpGroupNonUniformQuadBroadcast";
+    case OpGroupNonUniformQuadSwap: return "OpGroupNonUniformQuadSwap";
+    case OpCopyLogical: return "OpCopyLogical";
+    case OpPtrEqual: return "OpPtrEqual";
+    case OpPtrNotEqual: return "OpPtrNotEqual";
+    case OpPtrDiff: return "OpPtrDiff";
+    case OpColorAttachmentReadEXT: return "OpColorAttachmentReadEXT";
+    case OpDepthAttachmentReadEXT: return "OpDepthAttachmentReadEXT";
+    case OpStencilAttachmentReadEXT: return "OpStencilAttachmentReadEXT";
+    case OpTerminateInvocation: return "OpTerminateInvocation";
+    case OpTypeUntypedPointerKHR: return "OpTypeUntypedPointerKHR";
+    case OpUntypedVariableKHR: return "OpUntypedVariableKHR";
+    case OpUntypedAccessChainKHR: return "OpUntypedAccessChainKHR";
+    case OpUntypedInBoundsAccessChainKHR: return "OpUntypedInBoundsAccessChainKHR";
+    case OpSubgroupBallotKHR: return "OpSubgroupBallotKHR";
+    case OpSubgroupFirstInvocationKHR: return "OpSubgroupFirstInvocationKHR";
+    case OpUntypedPtrAccessChainKHR: return "OpUntypedPtrAccessChainKHR";
+    case OpUntypedInBoundsPtrAccessChainKHR: return "OpUntypedInBoundsPtrAccessChainKHR";
+    case OpUntypedArrayLengthKHR: return "OpUntypedArrayLengthKHR";
+    case OpUntypedPrefetchKHR: return "OpUntypedPrefetchKHR";
+    case OpSubgroupAllKHR: return "OpSubgroupAllKHR";
+    case OpSubgroupAnyKHR: return "OpSubgroupAnyKHR";
+    case OpSubgroupAllEqualKHR: return "OpSubgroupAllEqualKHR";
+    case OpGroupNonUniformRotateKHR: return "OpGroupNonUniformRotateKHR";
+    case OpSubgroupReadInvocationKHR: return "OpSubgroupReadInvocationKHR";
+    case OpExtInstWithForwardRefsKHR: return "OpExtInstWithForwardRefsKHR";
+    case OpTraceRayKHR: return "OpTraceRayKHR";
+    case OpExecuteCallableKHR: return "OpExecuteCallableKHR";
+    case OpConvertUToAccelerationStructureKHR: return "OpConvertUToAccelerationStructureKHR";
+    case OpIgnoreIntersectionKHR: return "OpIgnoreIntersectionKHR";
+    case OpTerminateRayKHR: return "OpTerminateRayKHR";
+    case OpSDot: return "OpSDot";
+    case OpUDot: return "OpUDot";
+    case OpSUDot: return "OpSUDot";
+    case OpSDotAccSat: return "OpSDotAccSat";
+    case OpUDotAccSat: return "OpUDotAccSat";
+    case OpSUDotAccSat: return "OpSUDotAccSat";
+    case OpTypeCooperativeMatrixKHR: return "OpTypeCooperativeMatrixKHR";
+    case OpCooperativeMatrixLoadKHR: return "OpCooperativeMatrixLoadKHR";
+    case OpCooperativeMatrixStoreKHR: return "OpCooperativeMatrixStoreKHR";
+    case OpCooperativeMatrixMulAddKHR: return "OpCooperativeMatrixMulAddKHR";
+    case OpCooperativeMatrixLengthKHR: return "OpCooperativeMatrixLengthKHR";
+    case OpConstantCompositeReplicateEXT: return "OpConstantCompositeReplicateEXT";
+    case OpSpecConstantCompositeReplicateEXT: return "OpSpecConstantCompositeReplicateEXT";
+    case OpCompositeConstructReplicateEXT: return "OpCompositeConstructReplicateEXT";
+    case OpTypeRayQueryKHR: return "OpTypeRayQueryKHR";
+    case OpRayQueryInitializeKHR: return "OpRayQueryInitializeKHR";
+    case OpRayQueryTerminateKHR: return "OpRayQueryTerminateKHR";
+    case OpRayQueryGenerateIntersectionKHR: return "OpRayQueryGenerateIntersectionKHR";
+    case OpRayQueryConfirmIntersectionKHR: return "OpRayQueryConfirmIntersectionKHR";
+    case OpRayQueryProceedKHR: return "OpRayQueryProceedKHR";
+    case OpRayQueryGetIntersectionTypeKHR: return "OpRayQueryGetIntersectionTypeKHR";
+    case OpImageSampleWeightedQCOM: return "OpImageSampleWeightedQCOM";
+    case OpImageBoxFilterQCOM: return "OpImageBoxFilterQCOM";
+    case OpImageBlockMatchSSDQCOM: return "OpImageBlockMatchSSDQCOM";
+    case OpImageBlockMatchSADQCOM: return "OpImageBlockMatchSADQCOM";
+    case OpImageBlockMatchWindowSSDQCOM: return "OpImageBlockMatchWindowSSDQCOM";
+    case OpImageBlockMatchWindowSADQCOM: return "OpImageBlockMatchWindowSADQCOM";
+    case OpImageBlockMatchGatherSSDQCOM: return "OpImageBlockMatchGatherSSDQCOM";
+    case OpImageBlockMatchGatherSADQCOM: return "OpImageBlockMatchGatherSADQCOM";
+    case OpGroupIAddNonUniformAMD: return "OpGroupIAddNonUniformAMD";
+    case OpGroupFAddNonUniformAMD: return "OpGroupFAddNonUniformAMD";
+    case OpGroupFMinNonUniformAMD: return "OpGroupFMinNonUniformAMD";
+    case OpGroupUMinNonUniformAMD: return "OpGroupUMinNonUniformAMD";
+    case OpGroupSMinNonUniformAMD: return "OpGroupSMinNonUniformAMD";
+    case OpGroupFMaxNonUniformAMD: return "OpGroupFMaxNonUniformAMD";
+    case OpGroupUMaxNonUniformAMD: return "OpGroupUMaxNonUniformAMD";
+    case OpGroupSMaxNonUniformAMD: return "OpGroupSMaxNonUniformAMD";
+    case OpFragmentMaskFetchAMD: return "OpFragmentMaskFetchAMD";
+    case OpFragmentFetchAMD: return "OpFragmentFetchAMD";
+    case OpReadClockKHR: return "OpReadClockKHR";
+    case OpFinalizeNodePayloadsAMDX: return "OpFinalizeNodePayloadsAMDX";
+    case OpFinishWritingNodePayloadAMDX: return "OpFinishWritingNodePayloadAMDX";
+    case OpInitializeNodePayloadsAMDX: return "OpInitializeNodePayloadsAMDX";
+    case OpGroupNonUniformQuadAllKHR: return "OpGroupNonUniformQuadAllKHR";
+    case OpGroupNonUniformQuadAnyKHR: return "OpGroupNonUniformQuadAnyKHR";
+    case OpHitObjectRecordHitMotionNV: return "OpHitObjectRecordHitMotionNV";
+    case OpHitObjectRecordHitWithIndexMotionNV: return "OpHitObjectRecordHitWithIndexMotionNV";
+    case OpHitObjectRecordMissMotionNV: return "OpHitObjectRecordMissMotionNV";
+    case OpHitObjectGetWorldToObjectNV: return "OpHitObjectGetWorldToObjectNV";
+    case OpHitObjectGetObjectToWorldNV: return "OpHitObjectGetObjectToWorldNV";
+    case OpHitObjectGetObjectRayDirectionNV: return "OpHitObjectGetObjectRayDirectionNV";
+    case OpHitObjectGetObjectRayOriginNV: return "OpHitObjectGetObjectRayOriginNV";
+    case OpHitObjectTraceRayMotionNV: return "OpHitObjectTraceRayMotionNV";
+    case OpHitObjectGetShaderRecordBufferHandleNV: return "OpHitObjectGetShaderRecordBufferHandleNV";
+    case OpHitObjectGetShaderBindingTableRecordIndexNV: return "OpHitObjectGetShaderBindingTableRecordIndexNV";
+    case OpHitObjectRecordEmptyNV: return "OpHitObjectRecordEmptyNV";
+    case OpHitObjectTraceRayNV: return "OpHitObjectTraceRayNV";
+    case OpHitObjectRecordHitNV: return "OpHitObjectRecordHitNV";
+    case OpHitObjectRecordHitWithIndexNV: return "OpHitObjectRecordHitWithIndexNV";
+    case OpHitObjectRecordMissNV: return "OpHitObjectRecordMissNV";
+    case OpHitObjectExecuteShaderNV: return "OpHitObjectExecuteShaderNV";
+    case OpHitObjectGetCurrentTimeNV: return "OpHitObjectGetCurrentTimeNV";
+    case OpHitObjectGetAttributesNV: return "OpHitObjectGetAttributesNV";
+    case OpHitObjectGetHitKindNV: return "OpHitObjectGetHitKindNV";
+    case OpHitObjectGetPrimitiveIndexNV: return "OpHitObjectGetPrimitiveIndexNV";
+    case OpHitObjectGetGeometryIndexNV: return "OpHitObjectGetGeometryIndexNV";
+    case OpHitObjectGetInstanceIdNV: return "OpHitObjectGetInstanceIdNV";
+    case OpHitObjectGetInstanceCustomIndexNV: return "OpHitObjectGetInstanceCustomIndexNV";
+    case OpHitObjectGetWorldRayDirectionNV: return "OpHitObjectGetWorldRayDirectionNV";
+    case OpHitObjectGetWorldRayOriginNV: return "OpHitObjectGetWorldRayOriginNV";
+    case OpHitObjectGetRayTMaxNV: return "OpHitObjectGetRayTMaxNV";
+    case OpHitObjectGetRayTMinNV: return "OpHitObjectGetRayTMinNV";
+    case OpHitObjectIsEmptyNV: return "OpHitObjectIsEmptyNV";
+    case OpHitObjectIsHitNV: return "OpHitObjectIsHitNV";
+    case OpHitObjectIsMissNV: return "OpHitObjectIsMissNV";
+    case OpReorderThreadWithHitObjectNV: return "OpReorderThreadWithHitObjectNV";
+    case OpReorderThreadWithHintNV: return "OpReorderThreadWithHintNV";
+    case OpTypeHitObjectNV: return "OpTypeHitObjectNV";
+    case OpImageSampleFootprintNV: return "OpImageSampleFootprintNV";
+    case OpEmitMeshTasksEXT: return "OpEmitMeshTasksEXT";
+    case OpSetMeshOutputsEXT: return "OpSetMeshOutputsEXT";
+    case OpGroupNonUniformPartitionNV: return "OpGroupNonUniformPartitionNV";
+    case OpWritePackedPrimitiveIndices4x8NV: return "OpWritePackedPrimitiveIndices4x8NV";
+    case OpFetchMicroTriangleVertexPositionNV: return "OpFetchMicroTriangleVertexPositionNV";
+    case OpFetchMicroTriangleVertexBarycentricNV: return "OpFetchMicroTriangleVertexBarycentricNV";
+    case OpReportIntersectionKHR: return "OpReportIntersectionKHR";
+    case OpIgnoreIntersectionNV: return "OpIgnoreIntersectionNV";
+    case OpTerminateRayNV: return "OpTerminateRayNV";
+    case OpTraceNV: return "OpTraceNV";
+    case OpTraceMotionNV: return "OpTraceMotionNV";
+    case OpTraceRayMotionNV: return "OpTraceRayMotionNV";
+    case OpRayQueryGetIntersectionTriangleVertexPositionsKHR: return "OpRayQueryGetIntersectionTriangleVertexPositionsKHR";
+    case OpTypeAccelerationStructureKHR: return "OpTypeAccelerationStructureKHR";
+    case OpExecuteCallableNV: return "OpExecuteCallableNV";
+    case OpTypeCooperativeMatrixNV: return "OpTypeCooperativeMatrixNV";
+    case OpCooperativeMatrixLoadNV: return "OpCooperativeMatrixLoadNV";
+    case OpCooperativeMatrixStoreNV: return "OpCooperativeMatrixStoreNV";
+    case OpCooperativeMatrixMulAddNV: return "OpCooperativeMatrixMulAddNV";
+    case OpCooperativeMatrixLengthNV: return "OpCooperativeMatrixLengthNV";
+    case OpBeginInvocationInterlockEXT: return "OpBeginInvocationInterlockEXT";
+    case OpEndInvocationInterlockEXT: return "OpEndInvocationInterlockEXT";
+    case OpDemoteToHelperInvocation: return "OpDemoteToHelperInvocation";
+    case OpIsHelperInvocationEXT: return "OpIsHelperInvocationEXT";
+    case OpConvertUToImageNV: return "OpConvertUToImageNV";
+    case OpConvertUToSamplerNV: return "OpConvertUToSamplerNV";
+    case OpConvertImageToUNV: return "OpConvertImageToUNV";
+    case OpConvertSamplerToUNV: return "OpConvertSamplerToUNV";
+    case OpConvertUToSampledImageNV: return "OpConvertUToSampledImageNV";
+    case OpConvertSampledImageToUNV: return "OpConvertSampledImageToUNV";
+    case OpSamplerImageAddressingModeNV: return "OpSamplerImageAddressingModeNV";
+    case OpRawAccessChainNV: return "OpRawAccessChainNV";
+    case OpSubgroupShuffleINTEL: return "OpSubgroupShuffleINTEL";
+    case OpSubgroupShuffleDownINTEL: return "OpSubgroupShuffleDownINTEL";
+    case OpSubgroupShuffleUpINTEL: return "OpSubgroupShuffleUpINTEL";
+    case OpSubgroupShuffleXorINTEL: return "OpSubgroupShuffleXorINTEL";
+    case OpSubgroupBlockReadINTEL: return "OpSubgroupBlockReadINTEL";
+    case OpSubgroupBlockWriteINTEL: return "OpSubgroupBlockWriteINTEL";
+    case OpSubgroupImageBlockReadINTEL: return "OpSubgroupImageBlockReadINTEL";
+    case OpSubgroupImageBlockWriteINTEL: return "OpSubgroupImageBlockWriteINTEL";
+    case OpSubgroupImageMediaBlockReadINTEL: return "OpSubgroupImageMediaBlockReadINTEL";
+    case OpSubgroupImageMediaBlockWriteINTEL: return "OpSubgroupImageMediaBlockWriteINTEL";
+    case OpUCountLeadingZerosINTEL: return "OpUCountLeadingZerosINTEL";
+    case OpUCountTrailingZerosINTEL: return "OpUCountTrailingZerosINTEL";
+    case OpAbsISubINTEL: return "OpAbsISubINTEL";
+    case OpAbsUSubINTEL: return "OpAbsUSubINTEL";
+    case OpIAddSatINTEL: return "OpIAddSatINTEL";
+    case OpUAddSatINTEL: return "OpUAddSatINTEL";
+    case OpIAverageINTEL: return "OpIAverageINTEL";
+    case OpUAverageINTEL: return "OpUAverageINTEL";
+    case OpIAverageRoundedINTEL: return "OpIAverageRoundedINTEL";
+    case OpUAverageRoundedINTEL: return "OpUAverageRoundedINTEL";
+    case OpISubSatINTEL: return "OpISubSatINTEL";
+    case OpUSubSatINTEL: return "OpUSubSatINTEL";
+    case OpIMul32x16INTEL: return "OpIMul32x16INTEL";
+    case OpUMul32x16INTEL: return "OpUMul32x16INTEL";
+    case OpConstantFunctionPointerINTEL: return "OpConstantFunctionPointerINTEL";
+    case OpFunctionPointerCallINTEL: return "OpFunctionPointerCallINTEL";
+    case OpAsmTargetINTEL: return "OpAsmTargetINTEL";
+    case OpAsmINTEL: return "OpAsmINTEL";
+    case OpAsmCallINTEL: return "OpAsmCallINTEL";
+    case OpAtomicFMinEXT: return "OpAtomicFMinEXT";
+    case OpAtomicFMaxEXT: return "OpAtomicFMaxEXT";
+    case OpAssumeTrueKHR: return "OpAssumeTrueKHR";
+    case OpExpectKHR: return "OpExpectKHR";
+    case OpDecorateString: return "OpDecorateString";
+    case OpMemberDecorateString: return "OpMemberDecorateString";
+    case OpVmeImageINTEL: return "OpVmeImageINTEL";
+    case OpTypeVmeImageINTEL: return "OpTypeVmeImageINTEL";
+    case OpTypeAvcImePayloadINTEL: return "OpTypeAvcImePayloadINTEL";
+    case OpTypeAvcRefPayloadINTEL: return "OpTypeAvcRefPayloadINTEL";
+    case OpTypeAvcSicPayloadINTEL: return "OpTypeAvcSicPayloadINTEL";
+    case OpTypeAvcMcePayloadINTEL: return "OpTypeAvcMcePayloadINTEL";
+    case OpTypeAvcMceResultINTEL: return "OpTypeAvcMceResultINTEL";
+    case OpTypeAvcImeResultINTEL: return "OpTypeAvcImeResultINTEL";
+    case OpTypeAvcImeResultSingleReferenceStreamoutINTEL: return "OpTypeAvcImeResultSingleReferenceStreamoutINTEL";
+    case OpTypeAvcImeResultDualReferenceStreamoutINTEL: return "OpTypeAvcImeResultDualReferenceStreamoutINTEL";
+    case OpTypeAvcImeSingleReferenceStreaminINTEL: return "OpTypeAvcImeSingleReferenceStreaminINTEL";
+    case OpTypeAvcImeDualReferenceStreaminINTEL: return "OpTypeAvcImeDualReferenceStreaminINTEL";
+    case OpTypeAvcRefResultINTEL: return "OpTypeAvcRefResultINTEL";
+    case OpTypeAvcSicResultINTEL: return "OpTypeAvcSicResultINTEL";
+    case OpSubgroupAvcMceGetDefaultInterBaseMultiReferencePenaltyINTEL: return "OpSubgroupAvcMceGetDefaultInterBaseMultiReferencePenaltyINTEL";
+    case OpSubgroupAvcMceSetInterBaseMultiReferencePenaltyINTEL: return "OpSubgroupAvcMceSetInterBaseMultiReferencePenaltyINTEL";
+    case OpSubgroupAvcMceGetDefaultInterShapePenaltyINTEL: return "OpSubgroupAvcMceGetDefaultInterShapePenaltyINTEL";
+    case OpSubgroupAvcMceSetInterShapePenaltyINTEL: return "OpSubgroupAvcMceSetInterShapePenaltyINTEL";
+    case OpSubgroupAvcMceGetDefaultInterDirectionPenaltyINTEL: return "OpSubgroupAvcMceGetDefaultInterDirectionPenaltyINTEL";
+    case OpSubgroupAvcMceSetInterDirectionPenaltyINTEL: return "OpSubgroupAvcMceSetInterDirectionPenaltyINTEL";
+    case OpSubgroupAvcMceGetDefaultIntraLumaShapePenaltyINTEL: return "OpSubgroupAvcMceGetDefaultIntraLumaShapePenaltyINTEL";
+    case OpSubgroupAvcMceGetDefaultInterMotionVectorCostTableINTEL: return "OpSubgroupAvcMceGetDefaultInterMotionVectorCostTableINTEL";
+    case OpSubgroupAvcMceGetDefaultHighPenaltyCostTableINTEL: return "OpSubgroupAvcMceGetDefaultHighPenaltyCostTableINTEL";
+    case OpSubgroupAvcMceGetDefaultMediumPenaltyCostTableINTEL: return "OpSubgroupAvcMceGetDefaultMediumPenaltyCostTableINTEL";
+    case OpSubgroupAvcMceGetDefaultLowPenaltyCostTableINTEL: return "OpSubgroupAvcMceGetDefaultLowPenaltyCostTableINTEL";
+    case OpSubgroupAvcMceSetMotionVectorCostFunctionINTEL: return "OpSubgroupAvcMceSetMotionVectorCostFunctionINTEL";
+    case OpSubgroupAvcMceGetDefaultIntraLumaModePenaltyINTEL: return "OpSubgroupAvcMceGetDefaultIntraLumaModePenaltyINTEL";
+    case OpSubgroupAvcMceGetDefaultNonDcLumaIntraPenaltyINTEL: return "OpSubgroupAvcMceGetDefaultNonDcLumaIntraPenaltyINTEL";
+    case OpSubgroupAvcMceGetDefaultIntraChromaModeBasePenaltyINTEL: return "OpSubgroupAvcMceGetDefaultIntraChromaModeBasePenaltyINTEL";
+    case OpSubgroupAvcMceSetAcOnlyHaarINTEL: return "OpSubgroupAvcMceSetAcOnlyHaarINTEL";
+    case OpSubgroupAvcMceSetSourceInterlacedFieldPolarityINTEL: return "OpSubgroupAvcMceSetSourceInterlacedFieldPolarityINTEL";
+    case OpSubgroupAvcMceSetSingleReferenceInterlacedFieldPolarityINTEL: return "OpSubgroupAvcMceSetSingleReferenceInterlacedFieldPolarityINTEL";
+    case OpSubgroupAvcMceSetDualReferenceInterlacedFieldPolaritiesINTEL: return "OpSubgroupAvcMceSetDualReferenceInterlacedFieldPolaritiesINTEL";
+    case OpSubgroupAvcMceConvertToImePayloadINTEL: return "OpSubgroupAvcMceConvertToImePayloadINTEL";
+    case OpSubgroupAvcMceConvertToImeResultINTEL: return "OpSubgroupAvcMceConvertToImeResultINTEL";
+    case OpSubgroupAvcMceConvertToRefPayloadINTEL: return "OpSubgroupAvcMceConvertToRefPayloadINTEL";
+    case OpSubgroupAvcMceConvertToRefResultINTEL: return "OpSubgroupAvcMceConvertToRefResultINTEL";
+    case OpSubgroupAvcMceConvertToSicPayloadINTEL: return "OpSubgroupAvcMceConvertToSicPayloadINTEL";
+    case OpSubgroupAvcMceConvertToSicResultINTEL: return "OpSubgroupAvcMceConvertToSicResultINTEL";
+    case OpSubgroupAvcMceGetMotionVectorsINTEL: return "OpSubgroupAvcMceGetMotionVectorsINTEL";
+    case OpSubgroupAvcMceGetInterDistortionsINTEL: return "OpSubgroupAvcMceGetInterDistortionsINTEL";
+    case OpSubgroupAvcMceGetBestInterDistortionsINTEL: return "OpSubgroupAvcMceGetBestInterDistortionsINTEL";
+    case OpSubgroupAvcMceGetInterMajorShapeINTEL: return "OpSubgroupAvcMceGetInterMajorShapeINTEL";
+    case OpSubgroupAvcMceGetInterMinorShapeINTEL: return "OpSubgroupAvcMceGetInterMinorShapeINTEL";
+    case OpSubgroupAvcMceGetInterDirectionsINTEL: return "OpSubgroupAvcMceGetInterDirectionsINTEL";
+    case OpSubgroupAvcMceGetInterMotionVectorCountINTEL: return "OpSubgroupAvcMceGetInterMotionVectorCountINTEL";
+    case OpSubgroupAvcMceGetInterReferenceIdsINTEL: return "OpSubgroupAvcMceGetInterReferenceIdsINTEL";
+    case OpSubgroupAvcMceGetInterReferenceInterlacedFieldPolaritiesINTEL: return "OpSubgroupAvcMceGetInterReferenceInterlacedFieldPolaritiesINTEL";
+    case OpSubgroupAvcImeInitializeINTEL: return "OpSubgroupAvcImeInitializeINTEL";
+    case OpSubgroupAvcImeSetSingleReferenceINTEL: return "OpSubgroupAvcImeSetSingleReferenceINTEL";
+    case OpSubgroupAvcImeSetDualReferenceINTEL: return "OpSubgroupAvcImeSetDualReferenceINTEL";
+    case OpSubgroupAvcImeRefWindowSizeINTEL: return "OpSubgroupAvcImeRefWindowSizeINTEL";
+    case OpSubgroupAvcImeAdjustRefOffsetINTEL: return "OpSubgroupAvcImeAdjustRefOffsetINTEL";
+    case OpSubgroupAvcImeConvertToMcePayloadINTEL: return "OpSubgroupAvcImeConvertToMcePayloadINTEL";
+    case OpSubgroupAvcImeSetMaxMotionVectorCountINTEL: return "OpSubgroupAvcImeSetMaxMotionVectorCountINTEL";
+    case OpSubgroupAvcImeSetUnidirectionalMixDisableINTEL: return "OpSubgroupAvcImeSetUnidirectionalMixDisableINTEL";
+    case OpSubgroupAvcImeSetEarlySearchTerminationThresholdINTEL: return "OpSubgroupAvcImeSetEarlySearchTerminationThresholdINTEL";
+    case OpSubgroupAvcImeSetWeightedSadINTEL: return "OpSubgroupAvcImeSetWeightedSadINTEL";
+    case OpSubgroupAvcImeEvaluateWithSingleReferenceINTEL: return "OpSubgroupAvcImeEvaluateWithSingleReferenceINTEL";
+    case OpSubgroupAvcImeEvaluateWithDualReferenceINTEL: return "OpSubgroupAvcImeEvaluateWithDualReferenceINTEL";
+    case OpSubgroupAvcImeEvaluateWithSingleReferenceStreaminINTEL: return "OpSubgroupAvcImeEvaluateWithSingleReferenceStreaminINTEL";
+    case OpSubgroupAvcImeEvaluateWithDualReferenceStreaminINTEL: return "OpSubgroupAvcImeEvaluateWithDualReferenceStreaminINTEL";
+    case OpSubgroupAvcImeEvaluateWithSingleReferenceStreamoutINTEL: return "OpSubgroupAvcImeEvaluateWithSingleReferenceStreamoutINTEL";
+    case OpSubgroupAvcImeEvaluateWithDualReferenceStreamoutINTEL: return "OpSubgroupAvcImeEvaluateWithDualReferenceStreamoutINTEL";
+    case OpSubgroupAvcImeEvaluateWithSingleReferenceStreaminoutINTEL: return "OpSubgroupAvcImeEvaluateWithSingleReferenceStreaminoutINTEL";
+    case OpSubgroupAvcImeEvaluateWithDualReferenceStreaminoutINTEL: return "OpSubgroupAvcImeEvaluateWithDualReferenceStreaminoutINTEL";
+    case OpSubgroupAvcImeConvertToMceResultINTEL: return "OpSubgroupAvcImeConvertToMceResultINTEL";
+    case OpSubgroupAvcImeGetSingleReferenceStreaminINTEL: return "OpSubgroupAvcImeGetSingleReferenceStreaminINTEL";
+    case OpSubgroupAvcImeGetDualReferenceStreaminINTEL: return "OpSubgroupAvcImeGetDualReferenceStreaminINTEL";
+    case OpSubgroupAvcImeStripSingleReferenceStreamoutINTEL: return "OpSubgroupAvcImeStripSingleReferenceStreamoutINTEL";
+    case OpSubgroupAvcImeStripDualReferenceStreamoutINTEL: return "OpSubgroupAvcImeStripDualReferenceStreamoutINTEL";
+    case OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeMotionVectorsINTEL: return "OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeMotionVectorsINTEL";
+    case OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeDistortionsINTEL: return "OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeDistortionsINTEL";
+    case OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeReferenceIdsINTEL: return "OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeReferenceIdsINTEL";
+    case OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeMotionVectorsINTEL: return "OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeMotionVectorsINTEL";
+    case OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeDistortionsINTEL: return "OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeDistortionsINTEL";
+    case OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeReferenceIdsINTEL: return "OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeReferenceIdsINTEL";
+    case OpSubgroupAvcImeGetBorderReachedINTEL: return "OpSubgroupAvcImeGetBorderReachedINTEL";
+    case OpSubgroupAvcImeGetTruncatedSearchIndicationINTEL: return "OpSubgroupAvcImeGetTruncatedSearchIndicationINTEL";
+    case OpSubgroupAvcImeGetUnidirectionalEarlySearchTerminationINTEL: return "OpSubgroupAvcImeGetUnidirectionalEarlySearchTerminationINTEL";
+    case OpSubgroupAvcImeGetWeightingPatternMinimumMotionVectorINTEL: return "OpSubgroupAvcImeGetWeightingPatternMinimumMotionVectorINTEL";
+    case OpSubgroupAvcImeGetWeightingPatternMinimumDistortionINTEL: return "OpSubgroupAvcImeGetWeightingPatternMinimumDistortionINTEL";
+    case OpSubgroupAvcFmeInitializeINTEL: return "OpSubgroupAvcFmeInitializeINTEL";
+    case OpSubgroupAvcBmeInitializeINTEL: return "OpSubgroupAvcBmeInitializeINTEL";
+    case OpSubgroupAvcRefConvertToMcePayloadINTEL: return "OpSubgroupAvcRefConvertToMcePayloadINTEL";
+    case OpSubgroupAvcRefSetBidirectionalMixDisableINTEL: return "OpSubgroupAvcRefSetBidirectionalMixDisableINTEL";
+    case OpSubgroupAvcRefSetBilinearFilterEnableINTEL: return "OpSubgroupAvcRefSetBilinearFilterEnableINTEL";
+    case OpSubgroupAvcRefEvaluateWithSingleReferenceINTEL: return "OpSubgroupAvcRefEvaluateWithSingleReferenceINTEL";
+    case OpSubgroupAvcRefEvaluateWithDualReferenceINTEL: return "OpSubgroupAvcRefEvaluateWithDualReferenceINTEL";
+    case OpSubgroupAvcRefEvaluateWithMultiReferenceINTEL: return "OpSubgroupAvcRefEvaluateWithMultiReferenceINTEL";
+    case OpSubgroupAvcRefEvaluateWithMultiReferenceInterlacedINTEL: return "OpSubgroupAvcRefEvaluateWithMultiReferenceInterlacedINTEL";
+    case OpSubgroupAvcRefConvertToMceResultINTEL: return "OpSubgroupAvcRefConvertToMceResultINTEL";
+    case OpSubgroupAvcSicInitializeINTEL: return "OpSubgroupAvcSicInitializeINTEL";
+    case OpSubgroupAvcSicConfigureSkcINTEL: return "OpSubgroupAvcSicConfigureSkcINTEL";
+    case OpSubgroupAvcSicConfigureIpeLumaINTEL: return "OpSubgroupAvcSicConfigureIpeLumaINTEL";
+    case OpSubgroupAvcSicConfigureIpeLumaChromaINTEL: return "OpSubgroupAvcSicConfigureIpeLumaChromaINTEL";
+    case OpSubgroupAvcSicGetMotionVectorMaskINTEL: return "OpSubgroupAvcSicGetMotionVectorMaskINTEL";
+    case OpSubgroupAvcSicConvertToMcePayloadINTEL: return "OpSubgroupAvcSicConvertToMcePayloadINTEL";
+    case OpSubgroupAvcSicSetIntraLumaShapePenaltyINTEL: return "OpSubgroupAvcSicSetIntraLumaShapePenaltyINTEL";
+    case OpSubgroupAvcSicSetIntraLumaModeCostFunctionINTEL: return "OpSubgroupAvcSicSetIntraLumaModeCostFunctionINTEL";
+    case OpSubgroupAvcSicSetIntraChromaModeCostFunctionINTEL: return "OpSubgroupAvcSicSetIntraChromaModeCostFunctionINTEL";
+    case OpSubgroupAvcSicSetBilinearFilterEnableINTEL: return "OpSubgroupAvcSicSetBilinearFilterEnableINTEL";
+    case OpSubgroupAvcSicSetSkcForwardTransformEnableINTEL: return "OpSubgroupAvcSicSetSkcForwardTransformEnableINTEL";
+    case OpSubgroupAvcSicSetBlockBasedRawSkipSadINTEL: return "OpSubgroupAvcSicSetBlockBasedRawSkipSadINTEL";
+    case OpSubgroupAvcSicEvaluateIpeINTEL: return "OpSubgroupAvcSicEvaluateIpeINTEL";
+    case OpSubgroupAvcSicEvaluateWithSingleReferenceINTEL: return "OpSubgroupAvcSicEvaluateWithSingleReferenceINTEL";
+    case OpSubgroupAvcSicEvaluateWithDualReferenceINTEL: return "OpSubgroupAvcSicEvaluateWithDualReferenceINTEL";
+    case OpSubgroupAvcSicEvaluateWithMultiReferenceINTEL: return "OpSubgroupAvcSicEvaluateWithMultiReferenceINTEL";
+    case OpSubgroupAvcSicEvaluateWithMultiReferenceInterlacedINTEL: return "OpSubgroupAvcSicEvaluateWithMultiReferenceInterlacedINTEL";
+    case OpSubgroupAvcSicConvertToMceResultINTEL: return "OpSubgroupAvcSicConvertToMceResultINTEL";
+    case OpSubgroupAvcSicGetIpeLumaShapeINTEL: return "OpSubgroupAvcSicGetIpeLumaShapeINTEL";
+    case OpSubgroupAvcSicGetBestIpeLumaDistortionINTEL: return "OpSubgroupAvcSicGetBestIpeLumaDistortionINTEL";
+    case OpSubgroupAvcSicGetBestIpeChromaDistortionINTEL: return "OpSubgroupAvcSicGetBestIpeChromaDistortionINTEL";
+    case OpSubgroupAvcSicGetPackedIpeLumaModesINTEL: return "OpSubgroupAvcSicGetPackedIpeLumaModesINTEL";
+    case OpSubgroupAvcSicGetIpeChromaModeINTEL: return "OpSubgroupAvcSicGetIpeChromaModeINTEL";
+    case OpSubgroupAvcSicGetPackedSkcLumaCountThresholdINTEL: return "OpSubgroupAvcSicGetPackedSkcLumaCountThresholdINTEL";
+    case OpSubgroupAvcSicGetPackedSkcLumaSumThresholdINTEL: return "OpSubgroupAvcSicGetPackedSkcLumaSumThresholdINTEL";
+    case OpSubgroupAvcSicGetInterRawSadsINTEL: return "OpSubgroupAvcSicGetInterRawSadsINTEL";
+    case OpVariableLengthArrayINTEL: return "OpVariableLengthArrayINTEL";
+    case OpSaveMemoryINTEL: return "OpSaveMemoryINTEL";
+    case OpRestoreMemoryINTEL: return "OpRestoreMemoryINTEL";
+    case OpArbitraryFloatSinCosPiINTEL: return "OpArbitraryFloatSinCosPiINTEL";
+    case OpArbitraryFloatCastINTEL: return "OpArbitraryFloatCastINTEL";
+    case OpArbitraryFloatCastFromIntINTEL: return "OpArbitraryFloatCastFromIntINTEL";
+    case OpArbitraryFloatCastToIntINTEL: return "OpArbitraryFloatCastToIntINTEL";
+    case OpArbitraryFloatAddINTEL: return "OpArbitraryFloatAddINTEL";
+    case OpArbitraryFloatSubINTEL: return "OpArbitraryFloatSubINTEL";
+    case OpArbitraryFloatMulINTEL: return "OpArbitraryFloatMulINTEL";
+    case OpArbitraryFloatDivINTEL: return "OpArbitraryFloatDivINTEL";
+    case OpArbitraryFloatGTINTEL: return "OpArbitraryFloatGTINTEL";
+    case OpArbitraryFloatGEINTEL: return "OpArbitraryFloatGEINTEL";
+    case OpArbitraryFloatLTINTEL: return "OpArbitraryFloatLTINTEL";
+    case OpArbitraryFloatLEINTEL: return "OpArbitraryFloatLEINTEL";
+    case OpArbitraryFloatEQINTEL: return "OpArbitraryFloatEQINTEL";
+    case OpArbitraryFloatRecipINTEL: return "OpArbitraryFloatRecipINTEL";
+    case OpArbitraryFloatRSqrtINTEL: return "OpArbitraryFloatRSqrtINTEL";
+    case OpArbitraryFloatCbrtINTEL: return "OpArbitraryFloatCbrtINTEL";
+    case OpArbitraryFloatHypotINTEL: return "OpArbitraryFloatHypotINTEL";
+    case OpArbitraryFloatSqrtINTEL: return "OpArbitraryFloatSqrtINTEL";
+    case OpArbitraryFloatLogINTEL: return "OpArbitraryFloatLogINTEL";
+    case OpArbitraryFloatLog2INTEL: return "OpArbitraryFloatLog2INTEL";
+    case OpArbitraryFloatLog10INTEL: return "OpArbitraryFloatLog10INTEL";
+    case OpArbitraryFloatLog1pINTEL: return "OpArbitraryFloatLog1pINTEL";
+    case OpArbitraryFloatExpINTEL: return "OpArbitraryFloatExpINTEL";
+    case OpArbitraryFloatExp2INTEL: return "OpArbitraryFloatExp2INTEL";
+    case OpArbitraryFloatExp10INTEL: return "OpArbitraryFloatExp10INTEL";
+    case OpArbitraryFloatExpm1INTEL: return "OpArbitraryFloatExpm1INTEL";
+    case OpArbitraryFloatSinINTEL: return "OpArbitraryFloatSinINTEL";
+    case OpArbitraryFloatCosINTEL: return "OpArbitraryFloatCosINTEL";
+    case OpArbitraryFloatSinCosINTEL: return "OpArbitraryFloatSinCosINTEL";
+    case OpArbitraryFloatSinPiINTEL: return "OpArbitraryFloatSinPiINTEL";
+    case OpArbitraryFloatCosPiINTEL: return "OpArbitraryFloatCosPiINTEL";
+    case OpArbitraryFloatASinINTEL: return "OpArbitraryFloatASinINTEL";
+    case OpArbitraryFloatASinPiINTEL: return "OpArbitraryFloatASinPiINTEL";
+    case OpArbitraryFloatACosINTEL: return "OpArbitraryFloatACosINTEL";
+    case OpArbitraryFloatACosPiINTEL: return "OpArbitraryFloatACosPiINTEL";
+    case OpArbitraryFloatATanINTEL: return "OpArbitraryFloatATanINTEL";
+    case OpArbitraryFloatATanPiINTEL: return "OpArbitraryFloatATanPiINTEL";
+    case OpArbitraryFloatATan2INTEL: return "OpArbitraryFloatATan2INTEL";
+    case OpArbitraryFloatPowINTEL: return "OpArbitraryFloatPowINTEL";
+    case OpArbitraryFloatPowRINTEL: return "OpArbitraryFloatPowRINTEL";
+    case OpArbitraryFloatPowNINTEL: return "OpArbitraryFloatPowNINTEL";
+    case OpLoopControlINTEL: return "OpLoopControlINTEL";
+    case OpAliasDomainDeclINTEL: return "OpAliasDomainDeclINTEL";
+    case OpAliasScopeDeclINTEL: return "OpAliasScopeDeclINTEL";
+    case OpAliasScopeListDeclINTEL: return "OpAliasScopeListDeclINTEL";
+    case OpFixedSqrtINTEL: return "OpFixedSqrtINTEL";
+    case OpFixedRecipINTEL: return "OpFixedRecipINTEL";
+    case OpFixedRsqrtINTEL: return "OpFixedRsqrtINTEL";
+    case OpFixedSinINTEL: return "OpFixedSinINTEL";
+    case OpFixedCosINTEL: return "OpFixedCosINTEL";
+    case OpFixedSinCosINTEL: return "OpFixedSinCosINTEL";
+    case OpFixedSinPiINTEL: return "OpFixedSinPiINTEL";
+    case OpFixedCosPiINTEL: return "OpFixedCosPiINTEL";
+    case OpFixedSinCosPiINTEL: return "OpFixedSinCosPiINTEL";
+    case OpFixedLogINTEL: return "OpFixedLogINTEL";
+    case OpFixedExpINTEL: return "OpFixedExpINTEL";
+    case OpPtrCastToCrossWorkgroupINTEL: return "OpPtrCastToCrossWorkgroupINTEL";
+    case OpCrossWorkgroupCastToPtrINTEL: return "OpCrossWorkgroupCastToPtrINTEL";
+    case OpReadPipeBlockingINTEL: return "OpReadPipeBlockingINTEL";
+    case OpWritePipeBlockingINTEL: return "OpWritePipeBlockingINTEL";
+    case OpFPGARegINTEL: return "OpFPGARegINTEL";
+    case OpRayQueryGetRayTMinKHR: return "OpRayQueryGetRayTMinKHR";
+    case OpRayQueryGetRayFlagsKHR: return "OpRayQueryGetRayFlagsKHR";
+    case OpRayQueryGetIntersectionTKHR: return "OpRayQueryGetIntersectionTKHR";
+    case OpRayQueryGetIntersectionInstanceCustomIndexKHR: return "OpRayQueryGetIntersectionInstanceCustomIndexKHR";
+    case OpRayQueryGetIntersectionInstanceIdKHR: return "OpRayQueryGetIntersectionInstanceIdKHR";
+    case OpRayQueryGetIntersectionInstanceShaderBindingTableRecordOffsetKHR: return "OpRayQueryGetIntersectionInstanceShaderBindingTableRecordOffsetKHR";
+    case OpRayQueryGetIntersectionGeometryIndexKHR: return "OpRayQueryGetIntersectionGeometryIndexKHR";
+    case OpRayQueryGetIntersectionPrimitiveIndexKHR: return "OpRayQueryGetIntersectionPrimitiveIndexKHR";
+    case OpRayQueryGetIntersectionBarycentricsKHR: return "OpRayQueryGetIntersectionBarycentricsKHR";
+    case OpRayQueryGetIntersectionFrontFaceKHR: return "OpRayQueryGetIntersectionFrontFaceKHR";
+    case OpRayQueryGetIntersectionCandidateAABBOpaqueKHR: return "OpRayQueryGetIntersectionCandidateAABBOpaqueKHR";
+    case OpRayQueryGetIntersectionObjectRayDirectionKHR: return "OpRayQueryGetIntersectionObjectRayDirectionKHR";
+    case OpRayQueryGetIntersectionObjectRayOriginKHR: return "OpRayQueryGetIntersectionObjectRayOriginKHR";
+    case OpRayQueryGetWorldRayDirectionKHR: return "OpRayQueryGetWorldRayDirectionKHR";
+    case OpRayQueryGetWorldRayOriginKHR: return "OpRayQueryGetWorldRayOriginKHR";
+    case OpRayQueryGetIntersectionObjectToWorldKHR: return "OpRayQueryGetIntersectionObjectToWorldKHR";
+    case OpRayQueryGetIntersectionWorldToObjectKHR: return "OpRayQueryGetIntersectionWorldToObjectKHR";
+    case OpAtomicFAddEXT: return "OpAtomicFAddEXT";
+    case OpTypeBufferSurfaceINTEL: return "OpTypeBufferSurfaceINTEL";
+    case OpTypeStructContinuedINTEL: return "OpTypeStructContinuedINTEL";
+    case OpConstantCompositeContinuedINTEL: return "OpConstantCompositeContinuedINTEL";
+    case OpSpecConstantCompositeContinuedINTEL: return "OpSpecConstantCompositeContinuedINTEL";
+    case OpCompositeConstructContinuedINTEL: return "OpCompositeConstructContinuedINTEL";
+    case OpConvertFToBF16INTEL: return "OpConvertFToBF16INTEL";
+    case OpConvertBF16ToFINTEL: return "OpConvertBF16ToFINTEL";
+    case OpControlBarrierArriveINTEL: return "OpControlBarrierArriveINTEL";
+    case OpControlBarrierWaitINTEL: return "OpControlBarrierWaitINTEL";
+    case OpSubgroupBlockPrefetchINTEL: return "OpSubgroupBlockPrefetchINTEL";
+    case OpGroupIMulKHR: return "OpGroupIMulKHR";
+    case OpGroupFMulKHR: return "OpGroupFMulKHR";
+    case OpGroupBitwiseAndKHR: return "OpGroupBitwiseAndKHR";
+    case OpGroupBitwiseOrKHR: return "OpGroupBitwiseOrKHR";
+    case OpGroupBitwiseXorKHR: return "OpGroupBitwiseXorKHR";
+    case OpGroupLogicalAndKHR: return "OpGroupLogicalAndKHR";
+    case OpGroupLogicalOrKHR: return "OpGroupLogicalOrKHR";
+    case OpGroupLogicalXorKHR: return "OpGroupLogicalXorKHR";
+    case OpMaskedGatherINTEL: return "OpMaskedGatherINTEL";
+    case OpMaskedScatterINTEL: return "OpMaskedScatterINTEL";
+    default: return "Unknown";
+    }
+}
+
 #endif /* SPV_ENABLE_UTILITY_CODE */
 
 // Overload bitwise operators for mask bit combining
@@ -2897,6 +4772,10 @@ inline CooperativeMatrixOperandsMask operator|(CooperativeMatrixOperandsMask a,
 inline CooperativeMatrixOperandsMask operator&(CooperativeMatrixOperandsMask a, CooperativeMatrixOperandsMask b) { return CooperativeMatrixOperandsMask(unsigned(a) & unsigned(b)); }
 inline CooperativeMatrixOperandsMask operator^(CooperativeMatrixOperandsMask a, CooperativeMatrixOperandsMask b) { return CooperativeMatrixOperandsMask(unsigned(a) ^ unsigned(b)); }
 inline CooperativeMatrixOperandsMask operator~(CooperativeMatrixOperandsMask a) { return CooperativeMatrixOperandsMask(~unsigned(a)); }
+inline RawAccessChainOperandsMask operator|(RawAccessChainOperandsMask a, RawAccessChainOperandsMask b) { return RawAccessChainOperandsMask(unsigned(a) | unsigned(b)); }
+inline RawAccessChainOperandsMask operator&(RawAccessChainOperandsMask a, RawAccessChainOperandsMask b) { return RawAccessChainOperandsMask(unsigned(a) & unsigned(b)); }
+inline RawAccessChainOperandsMask operator^(RawAccessChainOperandsMask a, RawAccessChainOperandsMask b) { return RawAccessChainOperandsMask(unsigned(a) ^ unsigned(b)); }
+inline RawAccessChainOperandsMask operator~(RawAccessChainOperandsMask a) { return RawAccessChainOperandsMask(~unsigned(a)); }
 
 }  // end namespace spv
 
diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h
index e22e24e337..73ee3d7e77 100644
--- a/include/vkgcDefs.h
+++ b/include/vkgcDefs.h
@@ -536,6 +536,8 @@ struct PipelineOptions {
   bool enablePrimGeneratedQuery; ///< If set, primitive generated query is enabled
   bool disablePerCompFetch;      ///< Disable per component fetch in uber fetch shader.
   bool reserved21;
+  bool optimizePointSizeWrite;        ///< If set, the write of PointSize in the last vertex processing stage will be
+                                      ///< eliminated if the write value is 1.0.
   CompileConstInfo *compileConstInfo; ///< Compile time constant data.
 };
 
diff --git a/lgc/CMakeLists.txt b/lgc/CMakeLists.txt
index 0d0ba569b4..7596d07991 100644
--- a/lgc/CMakeLists.txt
+++ b/lgc/CMakeLists.txt
@@ -65,6 +65,12 @@ set_compiler_options(LLVMlgc ${LLPC_ENABLE_WERROR})
 
 ### TableGen for LGC dialect ###########################################################################################
 
+# Pass the define if LLVM has the nodivergencesource attribute; an unknown (unset or 0) revision counts as latest.
+include(../cmake/LlvmMainRevision.cmake)
+if (NOT LLVM_MAIN_REVISION OR LLVM_MAIN_REVISION GREATER_EQUAL 514862)
+  set(LGC_TABLEGEN_FLAGS -DLLVM_HAVE_NODIVERGENCESOURCE_ATTR)
+endif()
+
 if (EXISTS ${LLVM_TOOLS_BINARY_PATH}/llvm-dialects-tblgen)
   set(LGC_TABLEGEN_EXE ${LLVM_TOOLS_BINARY_PATH}/llvm-dialects-tblgen)
 else()
@@ -76,13 +82,13 @@ set(LLVM_TARGET_DEFINITIONS interface/lgc/LgcDialect.td)
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/interface/lgc)
     file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/interface/lgc")
 endif()
-tablegen(LGC interface/lgc/LgcDialect.h.inc -gen-dialect-decls --dialect lgc
+tablegen(LGC interface/lgc/LgcDialect.h.inc -gen-dialect-decls --dialect lgc ${LGC_TABLEGEN_FLAGS}
     EXTRA_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}/../imported/llvm-dialects/include
     )
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/state)
     file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/state")
 endif()
-tablegen(LGC state/LgcDialect.cpp.inc -gen-dialect-defs --dialect lgc
+tablegen(LGC state/LgcDialect.cpp.inc -gen-dialect-defs --dialect lgc ${LGC_TABLEGEN_FLAGS}
     EXTRA_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}/../imported/llvm-dialects/include
     )
 
@@ -140,7 +146,7 @@ target_sources(LLVMlgc PRIVATE
 target_sources(LLVMlgc PRIVATE
     patch/ConfigBuilderBase.cpp
     patch/Continufy.cpp
-    patch/FragColorExport.cpp
+    patch/FragmentColorExport.cpp
     patch/LowerDebugPrintf.cpp
     patch/LowerDesc.cpp
     patch/LowerPopsInterlock.cpp
@@ -148,7 +154,8 @@ target_sources(LLVMlgc PRIVATE
     patch/MeshTaskShader.cpp
     patch/NggPrimShader.cpp
     patch/Patch.cpp
-    patch/PatchBufferOp.cpp
+    patch/StructurizeBuffers.cpp
+    patch/LowerBufferOperations.cpp
     patch/CheckShaderCache.cpp
     patch/GenerateCopyShader.cpp
     patch/MutateEntryPoint.cpp
@@ -159,15 +166,15 @@ target_sources(LLVMlgc PRIVATE
     patch/ScalarizeLoads.cpp
     patch/LowerMulDx9Zero.cpp
     patch/AddLoopMetadata.cpp
-    patch/PatchNullFragShader.cpp
-    patch/PatchPeepholeOpt.cpp
-    patch/PatchPreparePipelineAbi.cpp
-    patch/PatchReadFirstLane.cpp
-    patch/PatchResourceCollect.cpp
-    patch/PatchSetupTargetFeatures.cpp
-    patch/TcsPassthroughShader.cpp
+    patch/GenerateNullFragmentShader.cpp
+    patch/PeepholeOptimization.cpp
+    patch/PreparePipelineAbi.cpp
+    patch/LowerReadFirstLane.cpp
+    patch/CollectResourceUsage.cpp
+    patch/SetupTargetFeatures.cpp
+    patch/PassthroughHullShader.cpp
     patch/PatchInitializeWorkgroupMemory.cpp
-    patch/PatchWorkarounds.cpp
+    patch/ApplyWorkarounds.cpp
     patch/ShaderInputs.cpp
     patch/ShaderMerger.cpp
     patch/SystemValues.cpp
@@ -209,7 +216,9 @@ target_sources(LLVMlgc PRIVATE
     util/Internal.cpp
     util/MbStandardInstrumentations.cpp
     util/ModuleBunch.cpp
+    util/MsgPackScanner.cpp
     util/PassManager.cpp
+    util/RegStackUsage.cpp
     util/StartStopTimer.cpp
     util/WorkgroupLayout.cpp
 )
diff --git a/lgc/builder/BuilderRecorder.cpp b/lgc/builder/BuilderRecorder.cpp
index 143a7b5a6c..6bec003577 100644
--- a/lgc/builder/BuilderRecorder.cpp
+++ b/lgc/builder/BuilderRecorder.cpp
@@ -2051,12 +2051,18 @@ Instruction *Builder::record(BuilderOpcode opcode, Type *resultTy, ArrayRef<Valu
     case BuilderOpcode::Tanh:
     case BuilderOpcode::SubgroupBallotBitCount:
     case BuilderOpcode::SubgroupBallotBitExtract:
-    case BuilderOpcode::SubgroupBallotExclusiveBitCount:
     case BuilderOpcode::SubgroupBallotFindLsb:
     case BuilderOpcode::SubgroupBallotFindMsb:
-    case BuilderOpcode::SubgroupBallotInclusiveBitCount:
     case BuilderOpcode::SamplerFeedbackDesc:
-      // Functions that don't access memory.
+      // Functions that don't access memory, and are not a source of divergence.
+      func->setDoesNotAccessMemory();
+#if !LLVM_MAIN_REVISION || LLVM_MAIN_REVISION >= 514862
+      func->addFnAttr(Attribute::NoDivergenceSource);
+#endif
+      break;
+    case BuilderOpcode::SubgroupBallotExclusiveBitCount:
+    case BuilderOpcode::SubgroupBallotInclusiveBitCount:
+      // Functions that don't access memory, and could be a source of divergence.
       func->setDoesNotAccessMemory();
       break;
     case BuilderOpcode::ImageSample:
@@ -2070,13 +2076,21 @@ Instruction *Builder::record(BuilderOpcode opcode, Type *resultTy, ArrayRef<Valu
     case BuilderOpcode::ImageLoad:
     case BuilderOpcode::ImageLoadWithFmask:
     case BuilderOpcode::LoadPushConstantsPtr:
+      // Functions that only read memory and are not a source of divergence.
+      func->setOnlyReadsMemory();
+#if !LLVM_MAIN_REVISION || LLVM_MAIN_REVISION >= 514862
+      func->addFnAttr(Attribute::NoDivergenceSource);
+#endif
+      // Must be marked as returning for DCE.
+      func->addFnAttr(Attribute::WillReturn);
+      break;
     case BuilderOpcode::ReadBaryCoord:
     case BuilderOpcode::ReadBuiltInInput:
     case BuilderOpcode::ReadBuiltInOutput:
     case BuilderOpcode::ReadGenericInput:
     case BuilderOpcode::ReadGenericOutput:
     case BuilderOpcode::ReadPerVertexInput:
-      // Functions that only read memory.
+      // Functions that only read memory and could be a source of divergence.
       func->setOnlyReadsMemory();
       // Must be marked as returning for DCE.
       func->addFnAttr(Attribute::WillReturn);
@@ -2087,8 +2101,13 @@ Instruction *Builder::record(BuilderOpcode opcode, Type *resultTy, ArrayRef<Valu
       break;
     case BuilderOpcode::ImageAtomic:
     case BuilderOpcode::ImageAtomicCompareSwap:
+      // Functions that read and write memory and could be a source of divergence.
+      break;
     case BuilderOpcode::WriteXfbOutput:
-      // Functions that read and write memory.
+      // Functions that read and write memory and are not a source of divergence.
+#if !LLVM_MAIN_REVISION || LLVM_MAIN_REVISION >= 514862
+      func->addFnAttr(Attribute::NoDivergenceSource);
+#endif
       break;
     case BuilderOpcode::SubgroupBallot:
     case BuilderOpcode::SubgroupBroadcast:
@@ -2120,6 +2139,10 @@ Instruction *Builder::record(BuilderOpcode opcode, Type *resultTy, ArrayRef<Valu
       break;
     case BuilderOpcode::SubgroupWriteInvocation:
     case BuilderOpcode::DemoteToHelperInvocation:
+    case BuilderOpcode::IsHelperInvocation:
+      // TODO: These functions have not been classified yet, other than that we do not want to mark
+      // them as NoDivergenceSource.
+      break;
     case BuilderOpcode::EmitVertex:
     case BuilderOpcode::EndPrimitive:
     case BuilderOpcode::ImageGetLod:
@@ -2127,13 +2150,16 @@ Instruction *Builder::record(BuilderOpcode opcode, Type *resultTy, ArrayRef<Valu
     case BuilderOpcode::ImageQueryLevels:
     case BuilderOpcode::ImageQuerySamples:
     case BuilderOpcode::ImageQuerySize:
-    case BuilderOpcode::IsHelperInvocation:
     case BuilderOpcode::Kill:
     case BuilderOpcode::ReadClock:
     case BuilderOpcode::WriteBuiltInOutput:
     case BuilderOpcode::WriteGenericOutput:
     case BuilderOpcode::ImageBvhIntersectRay:
-      // TODO: These functions have not been classified yet.
+      // TODO: These functions have not been classified yet, other than that we want to mark
+      // them as NoDivergenceSource.
+#if !LLVM_MAIN_REVISION || LLVM_MAIN_REVISION >= 514862
+      func->addFnAttr(Attribute::NoDivergenceSource);
+#endif
       break;
     default:
       llvm_unreachable("Should never be called!");
diff --git a/lgc/builder/ImageBuilder.cpp b/lgc/builder/ImageBuilder.cpp
index 7e7aa56afb..df9123c523 100644
--- a/lgc/builder/ImageBuilder.cpp
+++ b/lgc/builder/ImageBuilder.cpp
@@ -2117,12 +2117,17 @@ void BuilderImpl::enforceReadFirstLane(Instruction *imageInst, unsigned descIdx)
   InsertPointGuard guard(*this);
   SetInsertPoint(imageInst);
   Value *origDesc = imageInst->getOperand(descIdx);
-  const unsigned elemCount = cast<FixedVectorType>(origDesc->getType())->getNumElements();
-  Value *newDesc = PoisonValue::get(FixedVectorType::get(getInt32Ty(), elemCount));
-  for (unsigned elemIdx = 0; elemIdx < elemCount; ++elemIdx) {
-    Value *elem = CreateExtractElement(origDesc, elemIdx);
-    elem = CreateIntrinsic(getInt32Ty(), Intrinsic::amdgcn_readfirstlane, elem);
-    newDesc = CreateInsertElement(newDesc, elem, elemIdx);
+  Value *newDesc;
+  if (isReadFirstLaneTypeSupported(origDesc->getType())) {
+    newDesc = CreateUnaryIntrinsic(Intrinsic::amdgcn_readfirstlane, origDesc);
+  } else {
+    const unsigned elemCount = cast<FixedVectorType>(origDesc->getType())->getNumElements();
+    newDesc = PoisonValue::get(FixedVectorType::get(getInt32Ty(), elemCount));
+    for (unsigned elemIdx = 0; elemIdx < elemCount; ++elemIdx) {
+      Value *elem = CreateExtractElement(origDesc, elemIdx);
+      elem = CreateIntrinsic(getInt32Ty(), Intrinsic::amdgcn_readfirstlane, elem);
+      newDesc = CreateInsertElement(newDesc, elem, elemIdx);
+    }
   }
   imageInst->setOperand(descIdx, newDesc);
 }
diff --git a/lgc/builder/InOutBuilder.cpp b/lgc/builder/InOutBuilder.cpp
index 44439f30a6..6142c7c146 100644
--- a/lgc/builder/InOutBuilder.cpp
+++ b/lgc/builder/InOutBuilder.cpp
@@ -1479,7 +1479,7 @@ Value *BuilderImpl::readCsBuiltIn(BuiltInKind builtIn, const Twine &instName) {
   default:
     // Not handled. This should never happen; we need to handle all CS built-ins here because the old way of
     // handling them (caller will handle with lgc.input.import.builtin, which is then lowered in
-    // PatchInOutImportExport) does not work with compute-with-calls.
+    // LowerInOut) does not work with compute-with-calls.
     llvm_unreachable("Unhandled CS built-in");
     return nullptr;
   }
@@ -1513,7 +1513,7 @@ Value *BuilderImpl::readVsBuiltIn(BuiltInKind builtIn, const Twine &instName) {
   case BuiltInInstanceId:
     return ShaderInputs::getInput(ShaderInput::InstanceId, builder, *getLgcContext());
   default:
-    // Not handled; caller will handle with lgc.input.import.builtin, which is then lowered in PatchInOutImportExport.
+    // Not handled; caller will handle with lgc.input.import.builtin, which is then lowered in LowerInOut.
     return nullptr;
   }
 }
@@ -1622,7 +1622,7 @@ Type *BuilderImpl::getBuiltInTy(BuiltInKind builtIn, InOutInfo inOutInfo) {
 
 // =====================================================================================================================
 // Mark usage of a built-in input. This is only needed where a built-in is handled by generating lgc.import.input
-// to be lowered in PatchInOutImportExport, and not when it is directly generated here using
+// to be lowered in LowerInOut, and not when it is directly generated here using
 // ShaderInputs::getInput() and/or ShaderInputs::getSpecialUserData().
 //
 // @param builtIn : Built-in ID
diff --git a/lgc/builder/MatrixBuilder.cpp b/lgc/builder/MatrixBuilder.cpp
index 1ad7306101..742f3ec9ea 100644
--- a/lgc/builder/MatrixBuilder.cpp
+++ b/lgc/builder/MatrixBuilder.cpp
@@ -366,6 +366,8 @@ Type *BuilderCommon::transCooperativeMatrixElementType(CooperativeMatrixElementT
   case CooperativeMatrixElementType::Float8:
   case CooperativeMatrixElementType::BFloat8:
     return getInt8Ty();
+  case CooperativeMatrixElementType::Int4:
+    return getIntNTy(4);
   default:
     llvm_unreachable("The element type is not supported.");
   }
@@ -381,12 +383,16 @@ Type *BuilderCommon::getCooperativeMatrixTy(CooperativeMatrixElementType elemTyp
   // types at the LGC level, and parameterize the type using both the element type and the layout.
 
   Type *wordTy = transCooperativeMatrixElementType(elemType)->isIntOrIntVectorTy() ? getInt32Ty() : getFloatTy();
+  unsigned nDwords = 0;
+  (void)(nDwords);
   switch (layout) {
   case CooperativeMatrixLayout::Gfx10Accumulator16bitMatrixLayout:
   case CooperativeMatrixLayout::Gfx10AccumulatorMatrixLayout:
   case CooperativeMatrixLayout::AccumulatorMatrixLayout:
     return FixedVectorType::get(wordTy, 8);
   case CooperativeMatrixLayout::FactorMatrixLayout:
+    if (elemType == CooperativeMatrixElementType::Int4)
+      return FixedVectorType::get(wordTy, 2);
     if (elemType == CooperativeMatrixElementType::Int8)
       return FixedVectorType::get(wordTy, 4);
     return FixedVectorType::get(wordTy, 8);
@@ -396,29 +402,36 @@ Type *BuilderCommon::getCooperativeMatrixTy(CooperativeMatrixElementType elemTyp
 }
 
 // =====================================================================================================================
-// Whether the type of a cooperative matrix is specified bit width.
+// Get the bit width of a cooperative matrix element type.
 //
 // @param elemType : the matrix element type
-// @param bitWidth : the specified bit width
-bool BuilderCommon::isTypeNCooperativeMatrix(CooperativeMatrixElementType elemType, unsigned bitWidth) {
-  unsigned width = 0;
+unsigned BuilderCommon::getBitWidthOfCooperativeMatrixElement(CooperativeMatrixElementType elemType) {
   switch (elemType) {
   case lgc::CooperativeMatrixElementType::Float16:
+  case lgc::CooperativeMatrixElementType::Float16Packed:
   case lgc::CooperativeMatrixElementType::BFloat16:
   case lgc::CooperativeMatrixElementType::Int16:
-    width = 16;
-    break;
+    return 16;
   case lgc::CooperativeMatrixElementType::Float32:
   case lgc::CooperativeMatrixElementType::Int32:
-    width = 32;
-    break;
+    return 32;
   case lgc::CooperativeMatrixElementType::Int8:
   case lgc::CooperativeMatrixElementType::Float8:
   case lgc::CooperativeMatrixElementType::BFloat8:
-    width = 8;
-    break;
+    return 8;
+  case lgc::CooperativeMatrixElementType::Int4:
+    return 4;
   default:
-    break;
+    llvm_unreachable("Type is not supported!");
   }
+}
+
+// =====================================================================================================================
+// Whether the type of a cooperative matrix is specified bit width.
+//
+// @param elemType : the matrix element type
+// @param bitWidth : the specified bit width
+bool BuilderCommon::isTypeNCooperativeMatrix(CooperativeMatrixElementType elemType, unsigned bitWidth) {
+  unsigned width = getBitWidthOfCooperativeMatrixElement(elemType);
   return width == bitWidth;
 }
diff --git a/lgc/builder/SubgroupBuilder.cpp b/lgc/builder/SubgroupBuilder.cpp
index 598097433e..758129281a 100644
--- a/lgc/builder/SubgroupBuilder.cpp
+++ b/lgc/builder/SubgroupBuilder.cpp
@@ -80,33 +80,26 @@ unsigned BuilderImpl::getShaderWaveSize() {
 // @param instName : Name to give final instruction.
 Value *SubgroupBuilder::CreateSubgroupElect(const Twine &instName) {
   auto shaderStage = getShaderStage(GetInsertBlock()->getParent());
-  return CreateICmpEQ(CreateSubgroupMbcnt(createGroupBallot(getTrue(), shaderStage.value())), getInt32(0));
+  const auto state = SubgroupHelperLaneState::get(shaderStage.value(), m_pipelineState);
+  return CreateICmpEQ(CreateSubgroupMbcnt(createGroupBallot(state, getTrue())), getInt32(0));
 }
 
 // =====================================================================================================================
 // Create a subgroup all call.
 //
+// @param state : The subgroup helper lane state
 // @param value : The value to compare across the subgroup. Must be an integer type.
-// @param shaderStage : shader stage enum.
 // @param instName : Name to give final instruction.
-Value *SubgroupBuilder::createSubgroupAll(Value *const value, ShaderStageEnum shaderStage, const Twine &instName) {
-  bool includeHelperLanes = false;
-  bool requireHelperLanes = false;
-
-  if (shaderStage == ShaderStage::Fragment) {
-    const auto &fragmentMode = m_pipelineState->getShaderModes()->getFragmentShaderMode();
-    includeHelperLanes = !fragmentMode.waveOpsExcludeHelperLanes;
-    requireHelperLanes = fragmentMode.waveOpsRequireHelperLanes;
-  }
-
-  Value *result = CreateICmpEQ(createGroupBallot(value, shaderStage), createGroupBallot(getTrue(), shaderStage));
+Value *SubgroupBuilder::createSubgroupAll(const SubgroupHelperLaneState &state, Value *const value,
+                                          const Twine &instName) {
+  Value *result = CreateICmpEQ(createGroupBallot(state, value), createGroupBallot(state, getTrue()));
   result = CreateSelect(CreateUnaryIntrinsic(Intrinsic::is_constant, value), value, result);
 
   // Helper invocations of whole quad mode should be included in the subgroup vote execution
-  if (includeHelperLanes) {
+  if (state.included()) {
     result = CreateZExt(result, getInt32Ty());
-    result = CreateIntrinsic(requireHelperLanes ? Intrinsic::amdgcn_wqm : Intrinsic::amdgcn_softwqm, {getInt32Ty()},
-                             {result});
+    result =
+        CreateIntrinsic(state.required() ? Intrinsic::amdgcn_wqm : Intrinsic::amdgcn_softwqm, {getInt32Ty()}, {result});
     result = CreateTrunc(result, getInt1Ty());
   }
   return result;
@@ -118,25 +111,16 @@ Value *SubgroupBuilder::createSubgroupAll(Value *const value, ShaderStageEnum sh
 // @param value : The value to compare across the subgroup. Must be an integer type.
 // @param instName : Name to give final instruction.
 Value *SubgroupBuilder::CreateSubgroupAny(Value *const value, const Twine &instName) {
-  auto shaderStage = getShaderStage(GetInsertBlock()->getParent());
-
-  bool includeHelperLanes = false;
-  bool requireHelperLanes = false;
-
-  if (getShaderStage(GetInsertBlock()->getParent()).value() == ShaderStage::Fragment) {
-    const auto &fragmentMode = m_pipelineState->getShaderModes()->getFragmentShaderMode();
-    includeHelperLanes = !fragmentMode.waveOpsExcludeHelperLanes;
-    requireHelperLanes = fragmentMode.waveOpsRequireHelperLanes;
-  }
-
-  Value *result = CreateICmpNE(createGroupBallot(value, shaderStage.value()), getInt64(0));
+  const auto stage = getShaderStage(GetInsertBlock()->getParent()).value();
+  const auto state = SubgroupHelperLaneState::get(stage, m_pipelineState);
+  Value *result = CreateICmpNE(createGroupBallot(state, value), getInt64(0));
   result = CreateSelect(CreateUnaryIntrinsic(Intrinsic::is_constant, value), value, result);
 
   // Helper invocations of whole quad mode should be included in the subgroup vote execution
-  if (includeHelperLanes) {
+  if (state.included()) {
     result = CreateZExt(result, getInt32Ty());
-    result = CreateIntrinsic(requireHelperLanes ? Intrinsic::amdgcn_wqm : Intrinsic::amdgcn_softwqm, {getInt32Ty()},
-                             {result});
+    result =
+        CreateIntrinsic(state.required() ? Intrinsic::amdgcn_wqm : Intrinsic::amdgcn_softwqm, {getInt32Ty()}, {result});
     result = CreateTrunc(result, getInt1Ty());
   }
   return result;
@@ -148,11 +132,12 @@ Value *SubgroupBuilder::CreateSubgroupAny(Value *const value, const Twine &instN
 // @param value : The value to compare across the subgroup. Must be an integer type.
 // @param instName : Name to give final instruction.
 Value *SubgroupBuilder::CreateSubgroupAllEqual(Value *const value, const Twine &instName) {
-  auto shaderStage = getShaderStage(GetInsertBlock()->getParent()).value();
+  const auto stage = getShaderStage(GetInsertBlock()->getParent()).value();
+  const auto state = SubgroupHelperLaneState::get(stage, m_pipelineState);
 
   Type *const type = value->getType();
 
-  Value *compare = createSubgroupBroadcastFirst(value, shaderStage, instName);
+  Value *compare = createSubgroupBroadcastFirst(state, value, instName);
 
   if (type->isFPOrFPVectorTy())
     compare = CreateFCmpOEQ(compare, value);
@@ -167,9 +152,9 @@ Value *SubgroupBuilder::CreateSubgroupAllEqual(Value *const value, const Twine &
     for (unsigned i = 1, compCount = cast<FixedVectorType>(type)->getNumElements(); i < compCount; i++)
       result = CreateAnd(result, CreateExtractElement(compare, i));
 
-    return createSubgroupAll(result, shaderStage, instName);
+    return createSubgroupAll(state, result, instName);
   }
-  return createSubgroupAll(compare, shaderStage, instName);
+  return createSubgroupAll(state, compare, instName);
 }
 
 // =====================================================================================================================
@@ -181,8 +166,6 @@ Value *SubgroupBuilder::CreateSubgroupAllEqual(Value *const value, const Twine &
 // @param instName : Name to give final instruction.
 Value *SubgroupBuilder::CreateSubgroupRotate(Value *const value, Value *const delta, Value *const clusterSize,
                                              const Twine &instName) {
-  auto shaderStage = getShaderStage(GetInsertBlock()->getParent()).value();
-
   // LocalId = SubgroupLocalInvocationId
   // RotationGroupSize = hasClusterSIze? ClusterSize : SubgroupSize.
   // Invocation ID = ((LocalId + Delta) & (RotationGroupSize - 1)) + (LocalId & ~(RotationGroupSize - 1))
@@ -193,8 +176,7 @@ Value *SubgroupBuilder::CreateSubgroupRotate(Value *const value, Value *const de
     invocationId =
         CreateOr(CreateAnd(invocationId, rotationGroupSize), CreateAnd(localId, CreateNot(rotationGroupSize)));
   }
-
-  return createSubgroupShuffle(value, invocationId, shaderStage, instName);
+  return CreateSubgroupShuffle(value, invocationId, instName);
 }
 
 // =====================================================================================================================
@@ -231,15 +213,14 @@ Value *BuilderImpl::CreateSubgroupBroadcastWaterfall(Value *const value, Value *
 // =====================================================================================================================
 // Create a subgroup broadcastfirst call.
 //
+// @param state : The subgroup helper lane state
 // @param value : The value to read from the first active lane into all other active lanes.
-// @param shaderStage : shader stage enum.
 // @param instName : Name to give final instruction.
-Value *BuilderImpl::createSubgroupBroadcastFirst(Value *const value, ShaderStageEnum shaderStage,
+Value *BuilderImpl::createSubgroupBroadcastFirst(const SubgroupHelperLaneState &state, Value *const value,
                                                  const Twine &instName) {
-  // For waveOpsExcludeHelperLanes mode, we need filter out the helperlane and use readlane instead.
-  if (shaderStage == ShaderStage::Fragment &&
-      m_pipelineState->getShaderModes()->getFragmentShaderMode().waveOpsExcludeHelperLanes) {
-    Value *ballot = createGroupBallot(getTrue(), shaderStage);
+  // If helper lanes are excluded, we need to filter them out and use readlane instead.
+  if (state.excluded()) {
+    Value *ballot = createGroupBallot(state, getTrue());
     Value *firstlane = CreateIntrinsic(Intrinsic::cttz, getInt64Ty(), {ballot, getTrue()});
     firstlane = CreateTrunc(firstlane, getInt32Ty());
 
@@ -268,7 +249,9 @@ Value *BuilderImpl::CreateSubgroupBallot(Value *const value, const Twine &instNa
   // Check the type is definitely an integer.
   assert(value->getType()->isIntegerTy());
 
-  Value *ballot = createGroupBallot(value);
+  const auto stage = getShaderStage(GetInsertBlock()->getParent()).value();
+  const auto state = SubgroupHelperLaneState::get(stage, m_pipelineState);
+  Value *ballot = createGroupBallot(state, value);
 
   // Ballot expects a <4 x i32> return, so we need to turn the i64 into that.
   ballot = CreateBitCast(ballot, FixedVectorType::get(getInt32Ty(), 2));
@@ -384,12 +367,13 @@ Value *BuilderImpl::CreateSubgroupBallotFindMsb(Value *const value, const Twine
 // =====================================================================================================================
 // Create a subgroup shuffle call.
 //
+// @param state : The subgroup helper lane state
 // @param value : The value to shuffle.
 // @param index : The index to shuffle from.
 // @param shaderStage : shader stage enum.
 // @param instName : Name to give final instruction.
-Value *BuilderImpl::createSubgroupShuffle(Value *const value, Value *const index, ShaderStageEnum shaderStage,
-                                          const Twine &instName) {
+Value *BuilderImpl::createSubgroupShuffle(const SubgroupHelperLaneState &state, Value *const value, Value *const index,
+                                          ShaderStageEnum shaderStage, const Twine &instName) {
 
   if (supportWaveWideBPermute(shaderStage)) {
     auto mapFunc = [](BuilderBase &builder, ArrayRef<Value *> mappedArgs,
@@ -435,14 +419,13 @@ Value *BuilderImpl::createSubgroupShuffle(Value *const value, Value *const index
     auto result = CreateSelect(indexInSameHalf, bPermSameHalf, bPermOtherHalf);
 
     // If required, force inputs of the operation to be computed in WQM.
-    if (shaderStage == ShaderStage::Fragment &&
-        m_pipelineState->getShaderModes()->getFragmentShaderMode().waveOpsRequireHelperLanes)
-      result = createWqm(result, shaderStage);
+    if (state.required())
+      result = createWqm(result);
 
     return result;
   }
 
-  return createShuffleLoop(value, index, shaderStage);
+  return createShuffleLoop(state, value, index);
 }
 
 // =====================================================================================================================
@@ -591,8 +574,9 @@ Value *BuilderImpl::CreateSubgroupClusteredReduction(GroupArithOp groupArithOp,
   Value *result = BuilderBase::get(*this).CreateSetInactive(value, identity);
 
   // For waveOpsExcludeHelperLanes mode, we need mask away the helperlane.
-  const auto &fragmentMode = m_pipelineState->getShaderModes()->getFragmentShaderMode();
-  if (m_shaderStage == ShaderStage::Fragment && fragmentMode.waveOpsExcludeHelperLanes) {
+  const auto stage = getShaderStage(GetInsertBlock()->getParent()).value();
+  const auto state = SubgroupHelperLaneState::get(stage, m_pipelineState);
+  if (state.excluded()) {
     auto isLive = CreateIntrinsic(Intrinsic::amdgcn_live_mask, {}, {}, nullptr, {});
     result = CreateSelect(isLive, result, identity);
   }
@@ -647,8 +631,7 @@ Value *BuilderImpl::CreateSubgroupClusteredReduction(GroupArithOp groupArithOp,
   result = createWwm(result);
 
   // If required, force inputs of the operation to be computed in WQM.
-  if (m_shaderStage == ShaderStage::Fragment &&
-      m_pipelineState->getShaderModes()->getFragmentShaderMode().waveOpsRequireHelperLanes)
+  if (state.required())
     result = createWqm(result);
 
   return result;
@@ -720,8 +703,9 @@ Value *BuilderImpl::CreateSubgroupClusteredInclusive(GroupArithOp groupArithOp,
   result = createWwm(result);
 
   // If required, force inputs of the operation to be computed in WQM.
-  if (m_shaderStage == ShaderStage::Fragment &&
-      m_pipelineState->getShaderModes()->getFragmentShaderMode().waveOpsRequireHelperLanes)
+  const auto stage = getShaderStage(GetInsertBlock()->getParent()).value();
+  const auto state = SubgroupHelperLaneState::get(stage, m_pipelineState);
+  if (state.required())
     result = createWqm(result);
 
   return result;
@@ -748,8 +732,9 @@ Value *BuilderImpl::CreateSubgroupClusteredExclusive(GroupArithOp groupArithOp,
   Value *result = BuilderBase::get(*this).CreateSetInactive(value, identity);
 
   // For waveOpsExcludeHelperLanes mode, we need mask away the helperlane.
-  const auto &fragmentMode = m_pipelineState->getShaderModes()->getFragmentShaderMode();
-  if (m_shaderStage == ShaderStage::Fragment && fragmentMode.waveOpsExcludeHelperLanes) {
+  const auto stage = getShaderStage(GetInsertBlock()->getParent()).value();
+  const auto state = SubgroupHelperLaneState::get(stage, m_pipelineState);
+  if (state.excluded()) {
     auto isLive = CreateIntrinsic(Intrinsic::amdgcn_live_mask, {}, {}, nullptr, {});
     result = CreateSelect(isLive, result, identity);
   }
@@ -825,8 +810,7 @@ Value *BuilderImpl::CreateSubgroupClusteredExclusive(GroupArithOp groupArithOp,
   result = createWwm(result);
 
   // If required, force inputs of the operation to be computed in WQM.
-  if (m_shaderStage == ShaderStage::Fragment &&
-      m_pipelineState->getShaderModes()->getFragmentShaderMode().waveOpsRequireHelperLanes)
+  if (state.required())
     result = createWqm(result);
 
   return result;
@@ -842,6 +826,8 @@ Value *BuilderImpl::CreateSubgroupClusteredExclusive(GroupArithOp groupArithOp,
 Value *BuilderImpl::CreateSubgroupClusteredMultiExclusive(GroupArithOp groupArithOp, Value *const value,
                                                           Value *const mask, const Twine &instName) {
   Value *const identity = createGroupArithmeticIdentity(groupArithOp, value->getType());
+  const auto stage = getShaderStage(GetInsertBlock()->getParent()).value();
+  const auto state = SubgroupHelperLaneState::get(stage, m_pipelineState);
 
   Value *laneIndex = CreateGetLaneNumber();
   Value *clusterMask =
@@ -853,8 +839,7 @@ Value *BuilderImpl::CreateSubgroupClusteredMultiExclusive(GroupArithOp groupArit
   Value *result = value;
 
   // For waveOpsExcludeHelperLanes mode, we need mask away the helperlane.
-  const auto &fragmentMode = m_pipelineState->getShaderModes()->getFragmentShaderMode();
-  if (m_shaderStage == ShaderStage::Fragment && fragmentMode.waveOpsExcludeHelperLanes) {
+  if (state.excluded()) {
     auto isLive = CreateIntrinsic(Intrinsic::amdgcn_live_mask, {}, {}, nullptr, {});
     result = CreateSelect(isLive, result, identity);
   }
@@ -866,7 +851,8 @@ Value *BuilderImpl::CreateSubgroupClusteredMultiExclusive(GroupArithOp groupArit
   Value *preLaneMask = CreateSub(CreateShl(constOne, CreateZExtOrTrunc(laneIndex, clusterMask->getType())), constOne);
   Value *checkMask = CreateAnd(preLaneMask, clusterMask);
 
-  Value *preLaneValue = CreateSubgroupShuffle(result, createFindMsb(checkMask), instName);
+  Value *preLaneValue = createSubgroupShuffle(SubgroupHelperLaneState::get(std::nullopt, state.requireHelperLanes),
+                                              result, createFindMsb(checkMask), m_shaderStage.value(), instName);
 
   result = CreateSelect(CreateICmpNE(checkMask, constZero), preLaneValue, identity);
 
@@ -882,7 +868,10 @@ Value *BuilderImpl::CreateSubgroupClusteredMultiExclusive(GroupArithOp groupArit
     Value *isPreviousLaneValid = CreateICmpNE(preClusterMask, constZero);
     Value *previousLaneIndex = createFindMsb(preClusterMask);
     Value *previousLaneValue = nullptr;
-    { previousLaneValue = CreateSubgroupShuffle(result, previousLaneIndex, instName); }
+    {
+      previousLaneValue = createSubgroupShuffle(SubgroupHelperLaneState::get(std::nullopt, state.requireHelperLanes),
+                                                result, previousLaneIndex, m_shaderStage.value(), instName);
+    }
 
     // Don't accumulate if there is no valid lane found in previous cluster or current lane is no need for accumulate.
     // TODO: Check amdgcn_inverse_ballot version.
@@ -1040,6 +1029,8 @@ Value *BuilderImpl::CreateSubgroupMbcnt(Value *const mask, const Twine &instName
 Value *BuilderImpl::CreateSubgroupPartition(llvm::Value *const value, const Twine &instName) {
   BasicBlock *currentBlock = GetInsertBlock();
   auto insertPoint = GetInsertPoint();
+  const auto stage = getShaderStage(GetInsertBlock()->getParent()).value();
+  const auto state = SubgroupHelperLaneState::get(stage, m_pipelineState);
 
   BasicBlock *beforeBlock =
       splitBlockBefore(currentBlock, &*insertPoint, nullptr, nullptr, nullptr, currentBlock->getName());
@@ -1061,9 +1052,9 @@ Value *BuilderImpl::CreateSubgroupPartition(llvm::Value *const value, const Twin
   // remove the default br generated by splitBlockBefore.
   loopBlock->getTerminator()->eraseFromParent();
   SetInsertPoint(loopBlock);
-  Value *laneValue = CreateSubgroupBroadcastFirst(targetValue);
+  Value *laneValue = createSubgroupBroadcastFirst(state, targetValue, instName);
   Value *isEqual = CreateICmpEQ(laneValue, targetValue);
-  Value *mask = createGroupBallot(isEqual);
+  Value *mask = createGroupBallot(state, isEqual);
   CreateCondBr(isEqual, loopEndBlock, loopBlock);
 
   // Handle Loop End
@@ -1332,13 +1323,12 @@ Value *BuilderImpl::createWwm(Value *const value) {
 // Only in fragment shader stage.
 //
 // @param value : The value to pass to the soft WQM call.
-// @param shaderStage : shader stage enum.
-Value *BuilderImpl::createWqm(Value *const value, ShaderStageEnum shaderStage) {
+Value *BuilderImpl::createWqm(Value *const value) {
   auto mapFunc = [](BuilderBase &builder, ArrayRef<Value *> mappedArgs, ArrayRef<Value *>) -> Value * {
     return builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_wqm, mappedArgs[0]);
   };
 
-  if (shaderStage == ShaderStage::Fragment)
+  if (m_shaderStage.value() == ShaderStage::Fragment)
     return CreateMapToSimpleType(mapFunc, value, {});
 
   return value;
@@ -1412,15 +1402,18 @@ Value *BuilderImpl::createInverseBallotSelect(uint64_t selectMask, Value *const
 }
 
 // =====================================================================================================================
-// Do group ballot with all active threads participated, turning a boolean value (in a VGPR) into a subgroup-wide
-// shared SGPR.
+// Do group ballot, turning a per-lane boolean value (in a VGPR) into a subgroup-wide shared SGPR.
 //
+// @param state : The subgroup helper lane state
 // @param value : The value to contribute to the SGPR, must be an boolean type.
-Value *BuilderImpl::createGroupBallotAllActive(Value *const value) {
+Value *BuilderImpl::createGroupBallot(const SubgroupHelperLaneState &state, Value *const value) {
   // Check the type is definitely an boolean.
   assert(value->getType()->isIntegerTy(1));
 
   Value *result = value;
+  if (state.excluded())
+    result = CreateAnd(CreateIntrinsic(Intrinsic::amdgcn_live_mask, {}, {}, nullptr, {}), result);
+
   unsigned waveSize = getShaderWaveSize();
   result = CreateIntrinsic(getIntNTy(waveSize), Intrinsic::amdgcn_ballot, result);
 
@@ -1431,34 +1424,6 @@ Value *BuilderImpl::createGroupBallotAllActive(Value *const value) {
   return result;
 }
 
-// =====================================================================================================================
-// Do group ballot, turning a per-lane boolean value (in a VGPR) into a subgroup-wide shared SGPR.
-//
-// @param value : The value to contribute to the SGPR, must be an boolean type.
-// @param shaderStage : shader stage enum.
-Value *BuilderImpl::createGroupBallot(Value *const value, ShaderStageEnum shaderStage) {
-  // Check the type is definitely an boolean.
-  assert(value->getType()->isIntegerTy(1));
-
-  Value *result = value;
-
-  // For waveOpsExcludeHelperLanes mode, we need mask away the helperlane.
-  if (shaderStage == ShaderStage::Fragment &&
-      m_pipelineState->getShaderModes()->getFragmentShaderMode().waveOpsExcludeHelperLanes) {
-    auto isLive = CreateIntrinsic(Intrinsic::amdgcn_live_mask, {}, {}, nullptr, {});
-    result = CreateAnd(isLive, result);
-  }
-  return createGroupBallotAllActive(result);
-}
-
-// =====================================================================================================================
-// Do group ballot, turning a per-lane boolean value (in a VGPR) into a subgroup-wide shared SGPR.
-//
-// @param value : The value to contribute to the SGPR, must be an boolean type.
-Value *BuilderImpl::createGroupBallot(Value *const value) {
-  return createGroupBallot(value, m_shaderStage.value());
-}
-
 // =====================================================================================================================
 // Create a traditional loop for subgroup shuffle.
 //
@@ -1484,11 +1449,12 @@ Value *BuilderImpl::createGroupBallot(Value *const value) {
 // }
 // while (workList != 0)
 //
+// @param state : The subgroup helper lane state
 // @param value : The value to shuffle.
 // @param index : The index to shuffle from.
 // @param instName : Name to give instruction(s)
-llvm::Value *BuilderImpl::createShuffleLoop(llvm::Value *const value, llvm::Value *const index,
-                                            ShaderStageEnum shaderStage, const llvm::Twine &instName) {
+llvm::Value *BuilderImpl::createShuffleLoop(const SubgroupHelperLaneState &state, llvm::Value *const value,
+                                            llvm::Value *const index, const llvm::Twine &instName) {
   assert(value != nullptr && index != nullptr);
   // Return readlane directly, if the index is a constant value.
   if (isa<Constant>(index))
@@ -1498,12 +1464,7 @@ llvm::Value *BuilderImpl::createShuffleLoop(llvm::Value *const value, llvm::Valu
   // By implementation, the Insert point has been set to the callInst when call processCall
   auto *loopPoint = &*(GetInsertPoint());
   auto *originalBlock = loopPoint->getParent();
-
-  // We are forcing all active threads participate the shuffle because CreateSubgroupClusteredMultiExclusive()
-  // depends on this to be correct.
-  // TODO: Refine the code or algorithm so that createShuffleLoop is no longer affected by external code
-  // implementations.
-  auto *workList = createGroupBallotAllActive(getTrue());
+  auto *workList = createGroupBallot(state, getTrue());
 
   // Init loop block.
   auto *loop = originalBlock->splitBasicBlock(loopPoint, ".shuffleLoop");
@@ -1530,7 +1491,7 @@ llvm::Value *BuilderImpl::createShuffleLoop(llvm::Value *const value, llvm::Valu
     return builder.CreateSelect(passthroughArgs[1], result, value);
   };
   auto result = CreateMapToSimpleType(mapFunc, {resultPhi, value}, {currentSrcLaneIndex, notCurrentLane});
-  auto newWorkList = CreateAnd(createGroupBallotAllActive(notCurrentLane), workListPhi);
+  auto newWorkList = CreateAnd(createGroupBallot(state, notCurrentLane), workListPhi);
   resultPhi->addIncoming(result, loop);
   workListPhi->addIncoming(newWorkList, loop);
   auto *cond = CreateICmpEQ(newWorkList, ConstantInt::get(waveSize, 0));
@@ -1562,7 +1523,10 @@ Value *BuilderImpl::createFindMsb(Value *const mask) {
 // @param requireFullQuads : Identify whether it's in wqm.
 // @param instName : Name to give final instruction.
 Value *BuilderImpl::CreateQuadBallot(Value *const value, bool requireFullQuads, const Twine &instName) {
-  Value *ballotValue = createGroupBallot(value);
+  const auto stage = getShaderStage(GetInsertBlock()->getParent()).value();
+  const auto state = SubgroupHelperLaneState::get(stage, m_pipelineState);
+
+  Value *ballotValue = createGroupBallot(state, value);
 
   // Get the 1st thread_id in the quad
   Value *threadId = CreateSubgroupMbcnt(getInt64(UINT64_MAX), "");
diff --git a/lgc/elfLinker/ColorExportShader.cpp b/lgc/elfLinker/ColorExportShader.cpp
index c2a8ca7a1b..d30622602e 100644
--- a/lgc/elfLinker/ColorExportShader.cpp
+++ b/lgc/elfLinker/ColorExportShader.cpp
@@ -31,7 +31,7 @@
  */
 
 #include "ColorExportShader.h"
-#include "lgc/patch/FragColorExport.h"
+#include "lgc/patch/FragmentColorExport.h"
 #include "lgc/state/TargetInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Target/TargetMachine.h"
diff --git a/lgc/elfLinker/ColorExportShader.h b/lgc/elfLinker/ColorExportShader.h
index 836a1d485b..8e013def42 100644
--- a/lgc/elfLinker/ColorExportShader.h
+++ b/lgc/elfLinker/ColorExportShader.h
@@ -32,7 +32,7 @@
 #pragma once
 
 #include "GlueShader.h"
-#include "lgc/patch/FragColorExport.h"
+#include "lgc/patch/FragmentColorExport.h"
 #include "lgc/state/PalMetadata.h"
 #include "lgc/state/PipelineState.h"
 
diff --git a/lgc/elfLinker/NullFragmentShader.cpp b/lgc/elfLinker/NullFragmentShader.cpp
index 9d763dead8..e4b26ee188 100644
--- a/lgc/elfLinker/NullFragmentShader.cpp
+++ b/lgc/elfLinker/NullFragmentShader.cpp
@@ -30,7 +30,7 @@
  */
 
 #include "NullFragmentShader.h"
-#include "lgc/patch/FragColorExport.h"
+#include "lgc/patch/FragmentColorExport.h"
 #include "lgc/patch/Patch.h"
 #include "lgc/state/TargetInfo.h"
 #include "llvm/Target/TargetMachine.h"
diff --git a/lgc/include/lgc/builder/BuilderImpl.h b/lgc/include/lgc/builder/BuilderImpl.h
index 7603c29cf8..92095f5c89 100644
--- a/lgc/include/lgc/builder/BuilderImpl.h
+++ b/lgc/include/lgc/builder/BuilderImpl.h
@@ -648,6 +648,33 @@ class BuilderImpl : public BuilderDefs {
 
   // -------------------------------------------------------------------------------------------------------------------
   // Builder implementation subclass for subgroup operations
+protected:
+  struct SubgroupHelperLaneState { // Helper-lane settings handed to the subgroup helpers (ballot, shuffle, ...)
+    std::optional<bool> excludeHelperLanes; // left unset (nullopt) for non-fragment stages, see get() below
+    std::optional<bool> requireHelperLanes; // left unset (nullopt) for non-fragment stages, see get() below
+    // Each predicate is false while its optional is unset, so included() is not simply !excluded().
+    bool excluded() const { return excludeHelperLanes && *excludeHelperLanes; } // set and true
+    bool included() const { return excludeHelperLanes && !*excludeHelperLanes; } // set and false
+    bool required() const { return requireHelperLanes && *requireHelperLanes; } // set and true
+    // Builds a state from explicit values; when called without arguments both fields stay unset.
+    static SubgroupHelperLaneState get(std::optional<bool> exclude = std::nullopt,
+                                       std::optional<bool> require = std::nullopt) {
+      return SubgroupHelperLaneState{
+          .excludeHelperLanes = exclude,
+          .requireHelperLanes = require,
+      };
+    }
+    static SubgroupHelperLaneState get(ShaderStageEnum stage, PipelineState *const pipelineState) {
+      if (stage != ShaderStage::Fragment) // only the fragment shader mode is consulted by this overload
+        return SubgroupHelperLaneState::get();
+      const auto &fragmentMode = pipelineState->getShaderModes()->getFragmentShaderMode();
+      return SubgroupHelperLaneState{ // both fields become set, normalized to bool from the mode flags
+          .excludeHelperLanes = !!fragmentMode.waveOpsExcludeHelperLanes,
+          .requireHelperLanes = !!fragmentMode.waveOpsRequireHelperLanes,
+      };
+    }
+  };
+
 public:
   // Create a get wave size query.
   llvm::Value *CreateGetWaveSize(const llvm::Twine &instName = "");
@@ -665,7 +692,9 @@ class BuilderImpl : public BuilderDefs {
 
   // Create a subgroup broadcast first.
   llvm::Value *CreateSubgroupBroadcastFirst(llvm::Value *const value, const llvm::Twine &instName = "") {
-    return createSubgroupBroadcastFirst(value, m_shaderStage.value(), instName);
+    const auto stage = getShaderStage(GetInsertBlock()->getParent()).value();
+    const auto state = SubgroupHelperLaneState::get(stage, m_pipelineState);
+    return createSubgroupBroadcastFirst(state, value, instName);
   }
 
   // Create a subgroup ballot.
@@ -696,7 +725,9 @@ class BuilderImpl : public BuilderDefs {
   // Create a subgroup shuffle.
   llvm::Value *CreateSubgroupShuffle(llvm::Value *const value, llvm::Value *const index,
                                      const llvm::Twine &instName = "") {
-    return createSubgroupShuffle(value, index, m_shaderStage.value(), instName);
+    const auto stage = getShaderStage(GetInsertBlock()->getParent()).value();
+    const auto state = SubgroupHelperLaneState::get(stage, m_pipelineState);
+    return createSubgroupShuffle(state, value, index, stage, instName);
   }
 
   // Create a subgroup shuffle xor.
@@ -786,7 +817,6 @@ class BuilderImpl : public BuilderDefs {
 
   llvm::Value *createDsSwizzle(llvm::Value *const value, uint16_t dsPattern);
   llvm::Value *createWwm(llvm::Value *const value);
-  llvm::Value *createWqm(llvm::Value *const value) { return createWqm(value, m_shaderStage.value()); }
   llvm::Value *createThreadMask();
   llvm::Value *createThreadMaskedSelect(llvm::Value *const threadMask, uint64_t andMask, llvm::Value *const value1,
                                         llvm::Value *const value2);
@@ -794,21 +824,19 @@ class BuilderImpl : public BuilderDefs {
   uint16_t getDsSwizzleBitMode(uint8_t xorMask, uint8_t orMask, uint8_t andMask);
   uint16_t getDsSwizzleQuadMode(uint8_t lane0, uint8_t lane1, uint8_t lane2, uint8_t lane3);
 
-  llvm::Value *createGroupBallot(llvm::Value *const value);
-  // Create a traditional loop for subgroup shuffle.
-  llvm::Value *createShuffleLoop(llvm::Value *const value, llvm::Value *const index, ShaderStageEnum shaderStage,
-                                 const llvm::Twine &instName = "");
-
 protected:
+  llvm::Value *createGroupBallot(const SubgroupHelperLaneState &state, llvm::Value *const value);
+  // Create a traditional loop for subgroup shuffle.
+  llvm::Value *createShuffleLoop(const SubgroupHelperLaneState &state, llvm::Value *const value,
+                                 llvm::Value *const index, const llvm::Twine &instName = "");
   // The subgroup operation with explicit shader stage as parameter.
   llvm::Value *createFindMsb(llvm::Value *const mask);
-  llvm::Value *createGroupBallotAllActive(llvm::Value *const value);
-  llvm::Value *createGroupBallot(llvm::Value *const value, ShaderStageEnum shaderStage);
-  llvm::Value *createSubgroupBroadcastFirst(llvm::Value *const value, ShaderStageEnum shaderStage,
+  llvm::Value *createSubgroupBroadcastFirst(const SubgroupHelperLaneState &status, llvm::Value *const value,
                                             const llvm::Twine &instName);
-  llvm::Value *createSubgroupShuffle(llvm::Value *const value, llvm::Value *const index, ShaderStageEnum shaderStage,
+  llvm::Value *createSubgroupShuffle(const SubgroupHelperLaneState &status, llvm::Value *const value,
+                                     llvm::Value *const index, ShaderStageEnum shaderStage,
                                      const llvm::Twine &instName);
-  llvm::Value *createWqm(llvm::Value *const value, ShaderStageEnum shaderStage);
+  llvm::Value *createWqm(llvm::Value *const value);
 };
 
 } // namespace lgc
diff --git a/lgc/include/lgc/builder/SubgroupBuilder.h b/lgc/include/lgc/builder/SubgroupBuilder.h
index 6497951021..5934543ac7 100644
--- a/lgc/include/lgc/builder/SubgroupBuilder.h
+++ b/lgc/include/lgc/builder/SubgroupBuilder.h
@@ -60,7 +60,9 @@ class SubgroupBuilder : public BuilderImpl {
   // @param value : The value to compare
   // @param instName : Name to give instruction(s)
   llvm::Value *CreateSubgroupAll(llvm::Value *const value, const llvm::Twine &instName = "") {
-    return createSubgroupAll(value, getShaderStage(GetInsertBlock()->getParent()).value(), instName);
+    const auto stage = getShaderStage(GetInsertBlock()->getParent()).value();
+    const auto state = SubgroupHelperLaneState::get(stage, m_pipelineState);
+    return createSubgroupAll(state, value, instName);
   }
 
   // Create a subgroup all equal.
@@ -84,7 +86,8 @@ class SubgroupBuilder : public BuilderImpl {
   SubgroupBuilder &operator=(const SubgroupBuilder &) = delete;
 
   // The subgroup operation with explicit shader stage as parameter.
-  llvm::Value *createSubgroupAll(llvm::Value *const value, ShaderStageEnum shaderStage, const llvm::Twine &instName);
+  llvm::Value *createSubgroupAll(const SubgroupHelperLaneState &state, llvm::Value *const value,
+                                 const llvm::Twine &instName);
 };
 
 } // namespace lgc
diff --git a/lgc/include/lgc/patch/AddLoopMetadata.h b/lgc/include/lgc/patch/AddLoopMetadata.h
index afe46f16ca..ecd0d69796 100644
--- a/lgc/include/lgc/patch/AddLoopMetadata.h
+++ b/lgc/include/lgc/patch/AddLoopMetadata.h
@@ -25,7 +25,7 @@
 /**
  ***********************************************************************************************************************
  * @file  AddLoopMetadata.h
- * @brief LLPC header file: contains declaration of class lgc::PatchLoopMetadata.
+ * @brief LLPC header file: contains declaration of class lgc::AddLoopMetadata.
  ***********************************************************************************************************************
  */
 #pragma once
@@ -39,9 +39,9 @@ namespace lgc {
 
 // =====================================================================================================================
 // Represents the LLVM pass for patching loop metadata.
-class PatchLoopMetadata : public llvm::PassInfoMixin<PatchLoopMetadata> {
+class AddLoopMetadata : public llvm::PassInfoMixin<AddLoopMetadata> {
 public:
-  PatchLoopMetadata();
+  AddLoopMetadata();
   llvm::PreservedAnalyses run(llvm::Loop &loop, llvm::LoopAnalysisManager &analysisManager,
                               llvm::LoopStandardAnalysisResults &loopAnalysisResults, llvm::LPMUpdater &);
 
diff --git a/lgc/include/lgc/patch/PatchWorkarounds.h b/lgc/include/lgc/patch/ApplyWorkarounds.h
similarity index 99%
rename from lgc/include/lgc/patch/PatchWorkarounds.h
rename to lgc/include/lgc/patch/ApplyWorkarounds.h
index 670872fbcb..f7777974af 100644
--- a/lgc/include/lgc/patch/PatchWorkarounds.h
+++ b/lgc/include/lgc/patch/ApplyWorkarounds.h
@@ -24,7 +24,7 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  PatchWorkarounds.h
+ * @file  ApplyWorkarounds.h
  * @brief LLPC header file: contains declaration of class lgc::PatchWorkarounds.
  ***********************************************************************************************************************
  */
diff --git a/lgc/include/lgc/patch/CollectImageOperations.h b/lgc/include/lgc/patch/CollectImageOperations.h
index d4f563f37b..acc1add82d 100644
--- a/lgc/include/lgc/patch/CollectImageOperations.h
+++ b/lgc/include/lgc/patch/CollectImageOperations.h
@@ -25,7 +25,7 @@
 /**
  ***********************************************************************************************************************
  * @file  CollectImageOperations.h
- * @brief LLPC header file: contains declaration of class lgc::PatchImageOpCollect.
+ * @brief LLPC header file: contains declaration of class lgc::CollectImageOperations.
  ***********************************************************************************************************************
  */
 #pragma once
@@ -38,7 +38,7 @@ namespace lgc {
 
 // =====================================================================================================================
 // Represents the pass of LLVM patching operations for image operations
-class PatchImageOpCollect : public llvm::PassInfoMixin<PatchImageOpCollect> {
+class CollectImageOperations : public llvm::PassInfoMixin<CollectImageOperations> {
 public:
   llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager);
 
diff --git a/lgc/include/lgc/patch/PatchResourceCollect.h b/lgc/include/lgc/patch/CollectResourceUsage.h
similarity index 99%
rename from lgc/include/lgc/patch/PatchResourceCollect.h
rename to lgc/include/lgc/patch/CollectResourceUsage.h
index b91bd69f6e..fd2816a07b 100644
--- a/lgc/include/lgc/patch/PatchResourceCollect.h
+++ b/lgc/include/lgc/patch/CollectResourceUsage.h
@@ -24,7 +24,7 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  PatchResourceCollect.h
+ * @file  CollectResourceUsage.h
  * @brief LLPC header file: contains declaration of class lgc::PatchResourceCollect.
  ***********************************************************************************************************************
  */
diff --git a/lgc/include/lgc/patch/FragColorExport.h b/lgc/include/lgc/patch/FragmentColorExport.h
similarity index 99%
rename from lgc/include/lgc/patch/FragColorExport.h
rename to lgc/include/lgc/patch/FragmentColorExport.h
index 1fcd756226..72cdc9589a 100644
--- a/lgc/include/lgc/patch/FragColorExport.h
+++ b/lgc/include/lgc/patch/FragmentColorExport.h
@@ -24,7 +24,7 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  FragColorExport.h
+ * @file  FragmentColorExport.h
  * @brief LLPC header file: contains declaration of class lgc::FragColorExport.
  ***********************************************************************************************************************
  */
diff --git a/lgc/include/lgc/patch/IncludeLlvmIr.h b/lgc/include/lgc/patch/IncludeLlvmIr.h
index b198d2c60a..ee2dcc48db 100644
--- a/lgc/include/lgc/patch/IncludeLlvmIr.h
+++ b/lgc/include/lgc/patch/IncludeLlvmIr.h
@@ -25,7 +25,7 @@
 /**
  ***********************************************************************************************************************
  * @file  IncludeLlvmIr.h
- * @brief LLPC header file: contains declaration of class lgc::PatchLlvmIrInclusion.
+ * @brief LLPC header file: contains declaration of class lgc::IncludeLlvmIr.
  ***********************************************************************************************************************
  */
 #pragma once
@@ -37,7 +37,7 @@ namespace lgc {
 
 // =====================================================================================================================
 // Represents the pass of LLVM patch operations of including LLVM IR as a separate section in the ELF binary.
-class PatchLlvmIrInclusion : public Patch, public llvm::PassInfoMixin<PatchLlvmIrInclusion> {
+class IncludeLlvmIr : public Patch, public llvm::PassInfoMixin<IncludeLlvmIr> {
 public:
   llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager);
 
diff --git a/lgc/include/lgc/patch/PatchBufferOp.h b/lgc/include/lgc/patch/LowerBufferOperations.h
similarity index 97%
rename from lgc/include/lgc/patch/PatchBufferOp.h
rename to lgc/include/lgc/patch/LowerBufferOperations.h
index a1feacdd79..d93842f30c 100644
--- a/lgc/include/lgc/patch/PatchBufferOp.h
+++ b/lgc/include/lgc/patch/LowerBufferOperations.h
@@ -24,7 +24,7 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  PatchBufferOp.h
+ * @file  LowerBufferOperations.h
  * @brief LLPC header file: contains declaration of class lgc::PatchBufferOp.
  ***********************************************************************************************************************
  */
@@ -52,6 +52,7 @@ namespace lgc {
 
 class BufferAddrToPtrOp;
 class BufferDescToPtrOp;
+class ConvertToStridedBufferPointerOp;
 class StridedBufferDescToPtrOp;
 class BufferLoadDescToPtrOp;
 class StridedBufferLoadDescToPtrOp;
@@ -107,6 +108,7 @@ class BufferOpLowering {
   void visitBufferDescToPtr(BufferDescToPtrOp &descToPtr);
   void visitStridedBufferDescToPtr(StridedBufferDescToPtrOp &descToPtr);
   void visitBufferLoadDescToPtr(BufferLoadDescToPtrOp &loadDescToPtr);
+  void visitConvertToStridedBufferPointer(ConvertToStridedBufferPointerOp &convertToStrided);
   void visitStridedBufferLoadDescToPtr(StridedBufferLoadDescToPtrOp &loadDescToPtr);
   void visitStridedBufferAddrAndStrideToPtr(StridedBufferAddrAndStrideToPtrOp &addrAndStrideToPtr);
   void visitStridedIndexAdd(StridedIndexAddOp &indexAdd);
@@ -122,6 +124,7 @@ class BufferOpLowering {
   void visitICmpInst(llvm::ICmpInst &icmpInst);
   void visitInvariantStart(llvm::IntrinsicInst &intrinsic);
   void visitLoadTfeOp(LoadTfeOp &loadTfe);
+  void visitReadFirstLane(llvm::IntrinsicInst &intrinsic);
 
   void postVisitLoadInst(llvm::LoadInst &loadInst);
   void postVisitStoreInst(llvm::StoreInst &storeInst);
diff --git a/lgc/include/lgc/patch/LowerInOut.h b/lgc/include/lgc/patch/LowerInOut.h
index e165006e49..8ebc2930a5 100644
--- a/lgc/include/lgc/patch/LowerInOut.h
+++ b/lgc/include/lgc/patch/LowerInOut.h
@@ -25,7 +25,7 @@
 /**
  ***********************************************************************************************************************
  * @file  LowerInOut.h
- * @brief LLPC header file: contains declaration of class lgc::PatchInOutImportExport.
+ * @brief LLPC header file: contains declaration of class lgc::LowerInOut.
  ***********************************************************************************************************************
  */
 #pragma once
@@ -43,9 +43,9 @@ namespace lgc {
 
 // =====================================================================================================================
 // Represents the pass of LLVM patching operations for input import and output export.
-class PatchInOutImportExport : public Patch, public llvm::PassInfoMixin<PatchInOutImportExport> {
+class LowerInOut : public Patch, public llvm::PassInfoMixin<LowerInOut> {
 public:
-  PatchInOutImportExport();
+  LowerInOut();
 
   llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager);
 
diff --git a/lgc/include/lgc/patch/LowerMulDx9Zero.h b/lgc/include/lgc/patch/LowerMulDx9Zero.h
index 400c447187..ed6dff5b1f 100644
--- a/lgc/include/lgc/patch/LowerMulDx9Zero.h
+++ b/lgc/include/lgc/patch/LowerMulDx9Zero.h
@@ -25,7 +25,7 @@
 /**
  ***********************************************************************************************************************
  * @file  LowerMulDx9Zero.h
- * @brief LLPC header file: contains declaration of class lgc::PatchMulDx9Zero.
+ * @brief LLPC header file: contains declaration of class lgc::LowerMulDx9Zero.
  ***********************************************************************************************************************
  */
 #pragma once
@@ -43,9 +43,9 @@ namespace lgc {
 // ((b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b)) or
 // ((b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b)) or
 // fma((b==0.0 ? 0.0 : a), (a==0.0 ? 0.0 : b), c)
-class PatchMulDx9Zero final : public llvm::InstVisitor<PatchMulDx9Zero>, public llvm::PassInfoMixin<PatchMulDx9Zero> {
+class LowerMulDx9Zero final : public llvm::InstVisitor<LowerMulDx9Zero>, public llvm::PassInfoMixin<LowerMulDx9Zero> {
 public:
-  explicit PatchMulDx9Zero();
+  explicit LowerMulDx9Zero();
 
   llvm::PreservedAnalyses run(llvm::Function &function, llvm::FunctionAnalysisManager &analysisManager);
 
diff --git a/lgc/include/lgc/patch/PatchReadFirstLane.h b/lgc/include/lgc/patch/LowerReadFirstLane.h
similarity index 98%
rename from lgc/include/lgc/patch/PatchReadFirstLane.h
rename to lgc/include/lgc/patch/LowerReadFirstLane.h
index f7d822533f..2c6cdc523a 100644
--- a/lgc/include/lgc/patch/PatchReadFirstLane.h
+++ b/lgc/include/lgc/patch/LowerReadFirstLane.h
@@ -24,7 +24,7 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  PatchReadFirstLane.h
+ * @file  LowerReadFirstLane.h
  * @brief LLPC header file: contains declaration of class lgc::PatchReadFirstLane.
  ***********************************************************************************************************************
  */
diff --git a/lgc/include/lgc/patch/MutateEntryPoint.h b/lgc/include/lgc/patch/MutateEntryPoint.h
index 639f3fca53..7a582b176f 100644
--- a/lgc/include/lgc/patch/MutateEntryPoint.h
+++ b/lgc/include/lgc/patch/MutateEntryPoint.h
@@ -46,6 +46,8 @@ namespace lgc {
 
 class UserDataOp;
 
+constexpr unsigned MemcpyScopeWorkGroup = 2;
+
 // =====================================================================================================================
 // The entry-point mutation pass
 class MutateEntryPoint : public Patch, public llvm::PassInfoMixin<MutateEntryPoint> {
@@ -55,6 +57,9 @@ class MutateEntryPoint : public Patch, public llvm::PassInfoMixin<MutateEntryPoi
 
   static llvm::StringRef name() { return "Patch LLVM for entry-point mutation"; }
 
+  static void processGroupMemcpy(GroupMemcpyOp &groupMemcpyOp, BuilderBase &builder, llvm::Value *threadIndex,
+                                 unsigned scopeSize);
+
 private:
   // A shader entry-point user data argument
   struct UserDataArg {
@@ -166,8 +171,8 @@ class MutateEntryPoint : public Patch, public llvm::PassInfoMixin<MutateEntryPoi
 
   bool isComputeWithCalls() const;
 
-  void processGroupMemcpy(llvm::Module &module);
-  void lowerGroupMemcpy(GroupMemcpyOp &groupMemcpyOp);
+  void processCsGroupMemcpy(llvm::Module &module);
+  void lowerCsGroupMemcpy(GroupMemcpyOp &groupMemcpyOp);
 
   void processDriverTableLoad(llvm::Module &module);
   void lowerDriverTableLoad(LoadDriverTableEntryOp &loadDriverTablePtrOp);
diff --git a/lgc/include/lgc/patch/TcsPassthroughShader.h b/lgc/include/lgc/patch/PassthroughHullShader.h
similarity index 98%
rename from lgc/include/lgc/patch/TcsPassthroughShader.h
rename to lgc/include/lgc/patch/PassthroughHullShader.h
index 191f26a46a..0917c3c81f 100644
--- a/lgc/include/lgc/patch/TcsPassthroughShader.h
+++ b/lgc/include/lgc/patch/PassthroughHullShader.h
@@ -24,7 +24,7 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  TcsPassthroughShader.h
+ * @file  PassthroughHullShader.h
  * @brief LLPC header file: contains declaration of class lgc::TcsPassthroughShader.
  ***********************************************************************************************************************
  */
diff --git a/lgc/include/lgc/patch/PatchPeepholeOpt.h b/lgc/include/lgc/patch/PeepholeOptimization.h
similarity index 98%
rename from lgc/include/lgc/patch/PatchPeepholeOpt.h
rename to lgc/include/lgc/patch/PeepholeOptimization.h
index e361c730a6..b2d05c7c09 100644
--- a/lgc/include/lgc/patch/PatchPeepholeOpt.h
+++ b/lgc/include/lgc/patch/PeepholeOptimization.h
@@ -24,7 +24,7 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  PatchPeepholeOpt.h
+ * @file  PeepholeOptimization.h
  * @brief LLPC header file: contains declaration of class lgc::PatchPeepholeOpt.
  ***********************************************************************************************************************
  */
diff --git a/lgc/include/lgc/patch/PatchPreparePipelineAbi.h b/lgc/include/lgc/patch/PreparePipelineAbi.h
similarity index 99%
rename from lgc/include/lgc/patch/PatchPreparePipelineAbi.h
rename to lgc/include/lgc/patch/PreparePipelineAbi.h
index e567137a12..5c025e1262 100644
--- a/lgc/include/lgc/patch/PatchPreparePipelineAbi.h
+++ b/lgc/include/lgc/patch/PreparePipelineAbi.h
@@ -24,7 +24,7 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  PatchPreparePipelineAbi.h
+ * @file  PreparePipelineAbi.h
  * @brief LLPC header file: contains declaration of class lgc::PatchPreparePipelineAbi.
  ***********************************************************************************************************************
  */
diff --git a/lgc/include/lgc/patch/ScalarizeLoads.h b/lgc/include/lgc/patch/ScalarizeLoads.h
index 670667e58f..8fd2286a40 100644
--- a/lgc/include/lgc/patch/ScalarizeLoads.h
+++ b/lgc/include/lgc/patch/ScalarizeLoads.h
@@ -25,7 +25,7 @@
 /**
  ***********************************************************************************************************************
  * @file  ScalarizeLoads.h
- * @brief LLPC header file: contains declaration of class lgc::PatchLoadScalarizer.
+ * @brief LLPC header file: contains declaration of class lgc::ScalarizeLoads.
  ***********************************************************************************************************************
  */
 #pragma once
@@ -40,10 +40,9 @@ namespace lgc {
 
 // =====================================================================================================================
 // Represents the pass of LLVM patching operations for scalarize load.
-class PatchLoadScalarizer final : public llvm::InstVisitor<PatchLoadScalarizer>,
-                                  public llvm::PassInfoMixin<PatchLoadScalarizer> {
+class ScalarizeLoads final : public llvm::InstVisitor<ScalarizeLoads>, public llvm::PassInfoMixin<ScalarizeLoads> {
 public:
-  explicit PatchLoadScalarizer();
+  explicit ScalarizeLoads();
 
   llvm::PreservedAnalyses run(llvm::Function &function, llvm::FunctionAnalysisManager &analysisManager);
 
diff --git a/lgc/include/lgc/patch/PatchSetupTargetFeatures.h b/lgc/include/lgc/patch/SetupTargetFeatures.h
similarity index 98%
rename from lgc/include/lgc/patch/PatchSetupTargetFeatures.h
rename to lgc/include/lgc/patch/SetupTargetFeatures.h
index de716c8561..9af3830dca 100644
--- a/lgc/include/lgc/patch/PatchSetupTargetFeatures.h
+++ b/lgc/include/lgc/patch/SetupTargetFeatures.h
@@ -24,7 +24,7 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  PatchSetupTargetFeatures.h
+ * @file  SetupTargetFeatures.h
  * @brief LLPC header file: contains declaration of class lgc::PatchSetupTargetFeatures.
  ***********************************************************************************************************************
  */
diff --git a/lgc/include/lgc/patch/StructurizeBuffers.h b/lgc/include/lgc/patch/StructurizeBuffers.h
new file mode 100644
index 0000000000..de92da38b3
--- /dev/null
+++ b/lgc/include/lgc/patch/StructurizeBuffers.h
@@ -0,0 +1,47 @@
+/*
+ ***********************************************************************************************************************
+ *
+ *  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal in the Software without restriction, including without limitation the
+ *  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ *  sell copies of the Software, and to permit persons to whom the Software is
+ *  furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice shall be included in all
+ *  copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ *
+ **********************************************************************************************************************/
+/**
+ ***********************************************************************************************************************
+ * @file  StructurizeBuffers.h
+ * @brief LLPC header file: contains declaration of class lgc::StructurizeBuffers.
+ ***********************************************************************************************************************
+ */
+
+#pragma once
+
+#include "lgc/patch/LowerBufferOperations.h"
+#include "llvm/IR/PassManager.h"
+namespace lgc {
+
+// =====================================================================================================================
+// Represents the pass of LLVM patching operations for structured buffer operations
+class StructurizeBuffers : public llvm::PassInfoMixin<StructurizeBuffers> {
+public: // run() takes a single llvm::Function and its analysis manager; it is only declared in this header
+  llvm::PreservedAnalyses run(llvm::Function &function, llvm::FunctionAnalysisManager &analysisManager);
+  // Returns the fixed, human-readable name string of this pass.
+  static llvm::StringRef name() { return "Patch LLVM for structured buffers"; }
+};
+
+} // namespace lgc
diff --git a/lgc/include/lgc/state/AbiMetadata.h b/lgc/include/lgc/state/AbiMetadata.h
index 00a8142eba..f718a9d6bb 100644
--- a/lgc/include/lgc/state/AbiMetadata.h
+++ b/lgc/include/lgc/state/AbiMetadata.h
@@ -166,6 +166,7 @@ static constexpr char SgprCount[] = ".sgpr_count";
 static constexpr char VgprLimit[] = ".vgpr_limit";
 static constexpr char SgprLimit[] = ".sgpr_limit";
 static constexpr char ThreadgroupDimensions[] = ".threadgroup_dimensions";
+static constexpr char OrigThreadgroupDimensions[] = ".orig_threadgroup_dimensions";
 static constexpr char WavefrontSize[] = ".wavefront_size";
 static constexpr char UsesUavs[] = ".uses_uavs";
 static constexpr char UsesRovs[] = ".uses_rovs";
@@ -441,9 +442,7 @@ static constexpr char UseVtxVrsRate[] = ".use_vtx_vrs_rate";
 static constexpr char BypassVtxRateCombiner[] = ".bypass_vtx_rate_combiner";
 static constexpr char BypassPrimRateCombiner[] = ".bypass_prim_rate_combiner";
 static constexpr char UseVtxGsCutFlag[] = ".use_vtx_gs_cut_flag";
-#if PAL_BUILD_GFX11
 static constexpr char UseVtxFsrSelect[] = ".use_vtx_fsr_select";
-#endif
 }; // namespace PaClVsOutCntlMetadataKey
 
 namespace GeNggSubgrpCntlMetadataKey {
@@ -993,7 +992,6 @@ typedef enum SWIZZLE_MODE_ENUM {
   SW_VAR_R__GFX10CORE = 0x0000000f,
   SW_VAR_S_X__GFX10CORE = 0x0000001d,
   SW_VAR_D_X__GFX10CORE = 0x0000001e,
-#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 || CHIP_HDR_PHOENIX1 || CHIP_HDR_STRIX1
   SW_256KB_Z__GFX11 = 0x0000000c,
   SW_256KB_S__GFX11 = 0x0000000d,
   SW_256KB_D__GFX11 = 0x0000000e,
@@ -1002,7 +1000,6 @@ typedef enum SWIZZLE_MODE_ENUM {
   SW_256KB_S_X__GFX11 = 0x0000001d,
   SW_256KB_D_X__GFX11 = 0x0000001e,
   SW_256KB_R_X__GFX11 = 0x0000001f,
-#endif
 } SWIZZLE_MODE_ENUM;
 
 } // namespace lgc
diff --git a/lgc/include/lgc/state/PipelineState.h b/lgc/include/lgc/state/PipelineState.h
index b4375d5f85..66c6aec626 100644
--- a/lgc/include/lgc/state/PipelineState.h
+++ b/lgc/include/lgc/state/PipelineState.h
@@ -326,9 +326,6 @@ class PipelineState final : public Pipeline {
   // Gets Util::Abi::HardwareStageFlagBits for the given shader stage
   unsigned getShaderHwStageMask(ShaderStageEnum stage);
 
-  // Set the default wave size for the specified shader stage
-  void setShaderDefaultWaveSize(ShaderStageEnum stage);
-
   // Set the wave size for the specified shader stage
   void setShaderWaveSize(ShaderStageEnum stage, unsigned waveSize) {
     assert(waveSize == 32 || waveSize == 64);
@@ -336,7 +333,7 @@ class PipelineState final : public Pipeline {
   }
 
   // Whether WGP mode is enabled for the given shader stage
-  bool getShaderWgpMode(ShaderStageEnum stage) const;
+  bool getShaderWgpMode(ShaderStageEnum stage);
 
   // Get NGG control settings
   NggControl *getNggControl() { return &m_nggControl; }
@@ -610,6 +607,11 @@ class PipelineState final : public Pipeline {
   // ABI Shader Map
   void buildAbiHwShaderMap();
 
+  // Set the default wave size for the specified shader stage
+  void setShaderDefaultWaveSize(ShaderStageEnum stage);
+  // Set the default wave size for all shader stages.
+  void setAllShadersDefaultWaveSize();
+
   std::string m_lastError; // Error to be reported by getLastError()
   bool m_emitLgc = false;  // Whether -emit-lgc is on
   // Whether generating pipeline or unlinked part-pipeline
diff --git a/lgc/include/lgc/state/TargetInfo.h b/lgc/include/lgc/state/TargetInfo.h
index c0bbe768c9..539b38c860 100644
--- a/lgc/include/lgc/state/TargetInfo.h
+++ b/lgc/include/lgc/state/TargetInfo.h
@@ -56,6 +56,7 @@ struct GfxIpVersion {
 // Represents the properties of GPU device.
 struct GpuProperty {
   unsigned numShaderEngines;                  // Number of shader engines present
+  unsigned numComputeUnitsPerShaderEngine;    // Number of compute units per shader engine
   unsigned waveSize;                          // Wavefront size
   unsigned ldsSizePerThreadGroup;             // LDS size per thread group in dwords
   unsigned gsOnChipDefaultPrimsPerSubgroup;   // Default target number of primitives per subgroup for GS on-chip mode.
diff --git a/lgc/include/lgc/util/Internal.h b/lgc/include/lgc/util/Internal.h
index e44a1ab0b4..ee1c60a62f 100644
--- a/lgc/include/lgc/util/Internal.h
+++ b/lgc/include/lgc/util/Internal.h
@@ -90,6 +90,9 @@ llvm::Argument *getFunctionArgument(llvm::Function *func, unsigned idx, const ll
 // Checks if one type can be bitcasted to the other (type1 -> type2).
 bool canBitCast(const llvm::Type *ty1, const llvm::Type *ty2);
 
+// Checks if the type is supported on amdgcn_readfirstlane in the backend.
+bool isReadFirstLaneTypeSupported(const llvm::Type *ty);
+
 // Checks if the specified value actually represents a don't-care value (0xFFFFFFFF).
 bool isDontCareValue(llvm::Value *value);
 
@@ -99,6 +102,10 @@ llvm::Type *getVgprTy(llvm::Type *ty);
 
 // Helper function to create LLVM Function and update NewDbgInfoFormat flag
 llvm::Function *createFunctionHelper(llvm::FunctionType *ty, llvm::GlobalValue::LinkageTypes linkage,
-                                     llvm::Module *module, const llvm::Twine &name = "");
+                                     llvm::Module *module, bool createDbgInfo = false, const llvm::Twine &name = "");
+
+// Helper function to call LLVM Function and set debug location
+llvm::CallInst *callFunctionHelper(llvm::Function *func, llvm::ArrayRef<llvm::Value *> args,
+                                   llvm::BasicBlock *insertAtEnd);
 
 } // namespace lgc
diff --git a/lgc/include/lgc/util/MsgPackScanner.h b/lgc/include/lgc/util/MsgPackScanner.h
new file mode 100644
index 0000000000..6b0b6713e9
--- /dev/null
+++ b/lgc/include/lgc/util/MsgPackScanner.h
@@ -0,0 +1,222 @@
+/*
+ ***********************************************************************************************************************
+ *
+ *  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal in the Software without restriction, including without limitation the
+ *  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ *  sell copies of the Software, and to permit persons to whom the Software is
+ *  furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice shall be included in all
+ *  copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ *
+ **********************************************************************************************************************/
+
+// MsgPackScanner class to read, write and incrementally update MsgPack.
+//
+// For the case that the caller has a small number of elements it wants to look at and it knows their names
+// and positions in the MsgPack document hierarchy, MsgPackScanner provides a more efficient way of doing
+// it than MsgPackDocument. MsgPackDocument builds the whole document hierarchy with maps, and thus has a
+// lot of malloc traffic, even for parts of the document that the caller is not interested in. MsgPackScanner
+// only creates a single map of the elements that the caller is interested in, so has a lot less malloc
+// traffic.
+//
+// A future project could be to change the LLVM AMDGPU PALMetadata class to use this, in the case that we are
+// compiling (it is being called from AsmPrinter), rather than assembling or disassembling. Then, this
+// code would need to be upstreamed in LLVM, with some tests.
+//
+// TODO: Does not yet implement deleting an element.
+//
+// Usage:
+//
+// 1. Write a spec for the structure of the MsgPack document and items within it that you want to read, modify
+//    or write. That is done with a static struct variable containing variables of type MsgPackScanner::Item.
+//    Where an item appears in a map, you give it the key name to match.
+//
+// 2. Construct a MsgPackScanner::Spec object, passing it a pointer to the struct in (1).
+//    This can be done one time for multiple MsgPackScanners, to common up the processing it does (constructing
+//    a map of the item names).
+//
+// 3. Construct a MsgPackScanner object, passing it the MsgPackScanner::Spec from (2).
+//
+// 4. Scan a MsgPack blob using MsgPackScanner::scan() (optional, to handle the case that your code is constructing
+//    a new MsgPack blob). You can give scan() a callback function, called when an item in your spec has just
+//    been found; with that, your spec can have an anonymous item in a map, and the callback gets called for
+//    each found entry in the corresponding map in the MsgPack blob.
+//
+// 5. Use isSet() to tell if an item is set (if it was matched in the scan), and asBool(), asInt(), asString() to
+//    get an item's value.
+//
+// 6. Use setBool() and set() to update an item to a new value. If the item does not already exist in the MsgPack
+//    blob, this creates it, and any parent maps and arrays that need creating, right up to the top-level item
+//    if this is the first setBool()/set() and you are creating a new MsgPack blob.
+//
+// 7. Use write() to write the updated MsgPack blob.
+
+#pragma once
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+
+namespace lgc {
+
+// =====================================================================================================================
+// MsgPackScanner class to read, write and incrementally update MsgPack.
+class MsgPackScanner {
+public:
+  // Item types in the item array that the caller passes to the MsgPackScanner::Spec constructor.
+  enum class ItemType : unsigned {
+    First = 1489898298, // Arbitrarily chosen magic number
+    Scalar = First,
+    EndContainer,
+    Map,
+    Array,
+    Last = Array
+  };
+
+  // Struct for item in spec. The MsgPackScanner is passed an array of Item, each with an ItemType and optional
+  // name, which the caller can construct as a struct.
+  // MsgPackScanner treats it as a tree, where Map or Array contain further items until the matching EndContainer.
+  // Items directly contained in a Map item can be named, to match the MsgPack map key, or unnamed, in which case
+  // it matches any key. The latter is only useful if the caller provides a callback function to scan() so it gets
+  // called each time an item is matched.
+  // The whole spec is either a single ItemType::Scalar, or an ItemType::Map/ItemType::Array with child
+  // elements, terminated by ItemType::EndContainer. Nested maps/arrays must also be correctly terminated.
+  struct Item {
+    ItemType itemType;
+    const char *name;
+  };
+
+  // A representation of the spec set up for MsgPackScanner. A client might want to set up one of these in a
+  // static variable, then use it to create and use multiple MsgPackScanner objects, possibly concurrently.
+  class Spec {
+  public:
+    // Constructor given pointer to caller's struct containing Items. The supplied struct must remain valid
+    // for the lifetime of the MsgPackScanner::Spec, which must remain valid for the lifetime of any
+    // MsgPackScanner using it.
+    Spec(const void *itemStruct);
+
+    // Accessors.
+    size_t size() const { return m_itemArray.size(); }
+    const Item &operator[](size_t idx) const { return m_itemArray[idx]; }
+    llvm::ArrayRef<Item>::iterator begin() const { return m_itemArray.begin(); }
+    llvm::ArrayRef<Item>::iterator end() const { return m_itemArray.end(); }
+    // Look up a {key, parent item index}, giving an item index. Key is one of:
+    // - FNV-1a hash of name for map; or
+    // - 0 for anonymous map entry; or
+    // - index for array entry.
+    std::optional<unsigned> lookup(unsigned key, unsigned parentItemIndex) const;
+    // Given an item index, get the parent index, or UINT_MAX if none (it is the root item).
+    unsigned getParentIndex(unsigned index) const { return m_parentIndices[index]; }
+
+  private:
+    // Supplied spec.
+    llvm::ArrayRef<Item> m_itemArray;
+    // Map from {key, parent item index} to item index. The key is an unsigned value, not the name string itself.
+    llvm::DenseMap<std::pair<unsigned, unsigned>, unsigned> m_itemMap;
+    // Parent item index for each item.
+    llvm::SmallVector<unsigned> m_parentIndices;
+  };
+
+  // Constructor given Spec object.
+  MsgPackScanner(const Spec &spec);
+
+  // Scan a MsgPack blob. Returns error for illegal MsgPack format, but tolerates empty blob.
+  // Can only be called once for this MsgPackScanner object, and must be called before other methods.
+  // The StringRef for the blob is retained, as it is used in subsequent method calls.
+  // Each item that is matched has its position in the MsgPack blob remembered, so that the caller can make
+  // subsequent isSet(), asBool(), asInt(), asString(), setBool(), set() calls on it.
+  // The callback is called just after finding an item in the item array, allowing the caller to accumulate
+  // a value from an item that occurs multiple times.
+  llvm::Error scan(llvm::StringRef blob, llvm::function_ref<llvm::Error(MsgPackScanner &, const Item &)> callback = {});
+
+  // Subsequent methods specify a particular item in the spec by passing a const reference to that item in
+  // the struct that the caller passed to the MsgPackScanner::Spec constructor.
+
+  // Determine whether an item is set.
+  bool isSet(const Item &item) const;
+
+  // Get an item as a bool. Returns {} if the item has some other type, or was not found.
+  std::optional<bool> asBool(const Item &item) const;
+
+  // Get an item as an integer. Returns {} if the item has some other type, or was not found.
+  std::optional<uint64_t> asInt(const Item &item) const;
+
+  // Get an item as a StringRef. Works for a string or binary object.
+  // Returns {} if the item has some other type, or was not found.
+  std::optional<llvm::StringRef> asString(const Item &item) const;
+
+  // Set an item as a bool. This gets a different name to avoid implicit conversions from other types to bool.
+  // If the item does not exist, it is created, increasing the size of its parent map/array. If the parent
+  // map/array does not exist, it is created, and so on.
+  void setBool(const Item &item, bool value);
+
+  // Set an item as an unsigned integer.
+  // If the item does not exist, it is created, increasing the size of its parent map/array. If the parent
+  // map/array does not exist, it is created, and so on.
+  void set(const Item &item, uint64_t value);
+
+  // Set an item as a string.
+  // If the item does not exist, it is created, increasing the size of its parent map/array. If the parent
+  // map/array does not exist, it is created, and so on.
+  void set(const Item &item, llvm::StringRef value);
+
+  // Write the whole MsgPack to the stream, as modified by any set() and setBool() calls made on it.
+  void write(llvm::raw_ostream &stream);
+
+private:
+  // Get size of next object.
+  llvm::Expected<unsigned> getObjectSize() const;
+
+  // Get an item's MsgPack-encoded value.
+  llvm::StringRef getValue(const Item &item) const;
+
+  // Set an item to the new value that has just been written in MsgPack format to m_newData.
+  size_t setValue(const Item &item, size_t newOffset, size_t newSize);
+
+  // Item info gathered during scan, one for each item in the supplied spec.
+  struct ItemInfo {
+    constexpr static const size_t NoNewOffset = ~size_t(0);
+    constexpr static const size_t NoReplacementNewSize = ~size_t(0);
+
+    size_t keyOffset;                  // Offset of key, only if this item is a map entry
+    size_t offset;                     // Offset of value
+    size_t size;                       // Size of value (just the header for map/array)
+    size_t endOffset;                  // End offset, only for map or array
+    size_t newKeyOffset = NoNewOffset; // Offset of new key in m_newData, or NoNewOffset
+    size_t newKeySize;                 // Size of new key
+    size_t newOffset = NoNewOffset;    // Offset of new value (from set()) in m_newData, or NoNewOffset
+    size_t newSize;                    // Size of new value (from set()), NoReplacementNewSize if deleting old item
+                                       //  without replacing it
+    unsigned gen;                      // Generation of new data, used to ensure that we action multiple inserts at
+                                       //  the same offset in the order we created them
+  };
+
+  const Spec &m_spec;
+#ifndef NDEBUG
+  bool m_inUse = false; // For asserting if user calls scan() after set() or setBool() or scan().
+#endif
+  llvm::StringRef m_blob;
+  size_t m_next;
+  llvm::SmallVector<ItemInfo> m_itemInfos;
+  llvm::SmallString<64> m_newData;
+  // Generation of new data, used to ensure that we action multiple inserts at the same offset in the
+  // order we created them.
+  unsigned m_gen = 0;
+};
+
+} // namespace lgc
diff --git a/lgc/interface/lgc/BuilderCommon.h b/lgc/interface/lgc/BuilderCommon.h
index 4efa6d7ebc..e5d3efdb32 100644
--- a/lgc/interface/lgc/BuilderCommon.h
+++ b/lgc/interface/lgc/BuilderCommon.h
@@ -122,6 +122,9 @@ class BuilderCommon : public llvm_dialects::Builder {
 
   // Whether the type of a cooperative matrix is specified bit width.
   static bool isTypeNCooperativeMatrix(CooperativeMatrixElementType elemType, unsigned bitWidth);
+
+  // Get the bit width of the cooperative matrix element.
+  static unsigned getBitWidthOfCooperativeMatrixElement(CooperativeMatrixElementType elemType);
 };
 
 } // namespace lgc
diff --git a/lgc/interface/lgc/LgcDialect.h b/lgc/interface/lgc/LgcDialect.h
index facbaa2477..2ed87fea56 100644
--- a/lgc/interface/lgc/LgcDialect.h
+++ b/lgc/interface/lgc/LgcDialect.h
@@ -50,6 +50,7 @@ enum class CooperativeMatrixElementType : unsigned {
   BFloat16,      // 16-bit brain floating-point
   Float8,        // 8-bit floating-point
   BFloat8,       // 8-bit brain floating-point
+  Int4,          // 4-bit integer
 };
 
 // Layout is virtual concept, eg: 16bit and 32bit for matrixC will share the same layout initially.
diff --git a/lgc/interface/lgc/LgcDialect.td b/lgc/interface/lgc/LgcDialect.td
index eb4fb0f792..3831160fb3 100644
--- a/lgc/interface/lgc/LgcDialect.td
+++ b/lgc/interface/lgc/LgcDialect.td
@@ -43,9 +43,18 @@ defm CooperativeMatrixElementType : AttrEnum<"CooperativeMatrixElementType">;
 defm CooperativeMatrixLayout : AttrEnum<"CooperativeMatrixLayout">;
 defm CooperativeMatrixArithOp : AttrEnum<"CooperativeMatrixArithOp">;
 
-class LgcOp<string mnemonic_, list<Trait> traits_ = []>
+def NoDivergenceSource : LlvmEnumAttributeTrait<"NoDivergenceSource">;
+
+class DivergentLgcOp<string mnemonic_, list<Trait> traits_ = []>
     : Op<LgcDialect, mnemonic_, traits_ # [NoUnwind]>;
 
+class LgcOp<string mnemonic_, list<Trait> traits_ = []>
+    : DivergentLgcOp<mnemonic_, traits_
+#ifdef LLVM_HAVE_NODIVERGENCESOURCE_ATTR
+                                        # [NoDivergenceSource]
+#endif
+                                                              >;
+
 def BufferAddrToPtrOp : LgcOp<"buffer.addr.to.ptr", [Memory<[]>, WillReturn]> {
   let arguments = (ins I64:$address);
   let results = (outs BufferPointer:$result);
@@ -72,6 +81,18 @@ def BufferDescToPtrOp : LgcOp<"buffer.desc.to.ptr", [Memory<[]>, WillReturn]> {
   }];
 }
 
+def ConvertToStridedBufferPointerOp : LgcOp<"convert.to.strided.buffer.pointer", [Memory<[]>, WillReturn]> {
+  let arguments = (ins BufferPointer:$ptr, AttrI32:$stride);
+  let results = (outs BufferStridedPointer:$result);
+
+  let summary = "convert a fat buffer pointer into a strided buffer pointer";
+  let description = [{
+    Given a buffer pointer and a stride, returns an indexed buffer pointer to the start of the buffer.
+
+    Adjusts the stride and number of elements in the descriptor.
+  }];
+}
+
 def StridedBufferDescToPtrOp : LgcOp<"strided.buffer.desc.to.ptr", [Memory<[]>, WillReturn]> {
   let arguments = (ins V4I32:$desc);
   let results = (outs BufferStridedPointer:$result);
@@ -103,6 +124,20 @@ def StridedIndexAddOp : LgcOp<"strided.index.add", [Memory<[]>, WillReturn]> {
   }];
 }
 
+def BufferIndexOp : LgcOp<"buffer.index", [Memory<[]>, WillReturn]> {
+  let arguments = (ins BufferPointer:$ptr, AttrI32:$stride, I32:$index);
+  let results = (outs BufferPointer:$marked);
+
+  let summary = "mark a runtime array for potential conversion to a strided buffer";
+  let description = [{
+    Mark the given array as a candidate for strided buffer pointers.
+    If the runtime array represents a (RW)StructuredBuffer, we can use indexed access to its elements
+    in some cases. In other cases, we prefer to manually calculate the address.
+
+    If it is not a StructuredBuffer, we can proceed as usual.
+  }];
+}
+
 def BufferLengthOp : LgcOp<"buffer.length", [Memory<[]>, WillReturn]> {
   let arguments = (ins BufferPointer:$pointer, I32:$offset);
   let results = (outs I32:$result);
@@ -262,7 +297,7 @@ def SetMeshPrimitiveCulledOp : LgcOp<"set.mesh.primitive.culled", [Memory<[]>]>
   }];
 }
 
-def GetMeshBuiltinInputOp : LgcOp<"get.mesh.builtin.input", [Memory<[]>, WillReturn]> {
+def GetMeshBuiltinInputOp : DivergentLgcOp<"get.mesh.builtin.input", [Memory<[]>, WillReturn]> {
   let arguments = (ins AttrI32:$builtin);
   let results = (outs value:$result);
 
@@ -324,7 +359,7 @@ def GenericLocationOp : OpClass<LgcDialect> {
   }];
 }
 
-def InputImportGenericOp : LgcOp<"input.import.generic", [Memory<[]>, WillReturn]> {
+def InputImportGenericOp : DivergentLgcOp<"input.import.generic", [Memory<[]>, WillReturn]> {
   let superclass = GenericLocationOp;
 
   let arguments = (ins GenericLocationOp);
@@ -341,7 +376,7 @@ def InputImportGenericOp : LgcOp<"input.import.generic", [Memory<[]>, WillReturn
 }
 
 // TODO: Consider restricting the memory effect to inaccessible memory only.
-def OutputImportGenericOp : LgcOp<"output.import.generic", [Memory<[(read)]>, WillReturn]> {
+def OutputImportGenericOp : DivergentLgcOp<"output.import.generic", [Memory<[(read)]>, WillReturn]> {
   let superclass = GenericLocationOp;
 
   let arguments = (ins GenericLocationOp);
@@ -357,7 +392,7 @@ def OutputImportGenericOp : LgcOp<"output.import.generic", [Memory<[(read)]>, Wi
   }];
 }
 
-def InputImportInterpolatedOp : LgcOp<"input.import.interpolated", [Memory<[]>, WillReturn]> {
+def InputImportInterpolatedOp : DivergentLgcOp<"input.import.interpolated", [Memory<[]>, WillReturn]> {
   let superclass = GenericLocationOp;
 
   let arguments = (ins GenericLocationOp, AttrI32:$interp_mode, value:$interp_value);
@@ -530,7 +565,7 @@ def SubgroupRotateOp : LgcOp<"subgroup.rotate", [NoUnwind, Convergent]> {
   }];
 }
 
-def CooperativeRowAccLoadOp : LgcOp<"cooperative.rowacc.load", [Memory<[(read)]>, WillReturn]> {
+def CooperativeRowAccLoadOp : DivergentLgcOp<"cooperative.rowacc.load", [Memory<[(read)]>, WillReturn]> {
   let arguments = (ins value:$pointer, I32:$stride, CooperativeMatrixElementType:$elem_type, CooperativeMatrixMemoryAccess:$memory_access);
   let results = (outs value:$result);
 
@@ -573,7 +608,7 @@ def CooperativeRowAccStoreOp : LgcOp<"cooperative.rowacc.store", [Memory<[(write
   }];
 }
 
-def CooperativeRowAccAccumulateModeOp : LgcOp<"cooperative.rowacc.accumulate.mode", [Memory<[]>, WillReturn]> {
+def CooperativeRowAccAccumulateModeOp : DivergentLgcOp<"cooperative.rowacc.accumulate.mode", [Memory<[]>, WillReturn]> {
   let arguments = (ins value:$row_acc, CooperativeMatrixElementType:$elem_type);
   let results = (outs (eq $row_acc):$result);
 
@@ -588,7 +623,7 @@ def CooperativeRowAccAccumulateModeOp : LgcOp<"cooperative.rowacc.accumulate.mod
   }];
 }
 
-def CooperativeRowAccFinalizeModeOp : LgcOp<"cooperative.rowacc.finalize.mode", [Memory<[]>, WillReturn]> {
+def CooperativeRowAccFinalizeModeOp : DivergentLgcOp<"cooperative.rowacc.finalize.mode", [Memory<[]>, WillReturn]> {
   let arguments = (ins value:$row_acc, CooperativeMatrixElementType:$elem_type);
   let results = (outs (eq $row_acc):$result);
 
@@ -603,7 +638,7 @@ def CooperativeRowAccFinalizeModeOp : LgcOp<"cooperative.rowacc.finalize.mode",
   }];
 }
 
-def CooperativeRowAccSplatOp : LgcOp<"cooperative.rowacc.splat", [Memory<[]>, WillReturn]> {
+def CooperativeRowAccSplatOp : DivergentLgcOp<"cooperative.rowacc.splat", [Memory<[]>, WillReturn]> {
   let arguments = (ins value:$scalar, CooperativeMatrixElementType:$elem_type);
   let results = (outs value:$result);
 
@@ -618,7 +653,7 @@ def CooperativeRowAccSplatOp : LgcOp<"cooperative.rowacc.splat", [Memory<[]>, Wi
   }];
 }
 
-def CooperativeRowAccSumAccumulateOp : LgcOp<"cooperative.rowacc.sum.accumulate", [Memory<[]>, WillReturn]> {
+def CooperativeRowAccSumAccumulateOp : DivergentLgcOp<"cooperative.rowacc.sum.accumulate", [Memory<[]>, WillReturn]> {
   let arguments = (ins value:$matrix, CooperativeMatrixElementType:$matrix_elem_type, CooperativeMatrixLayout:$matrix_layout, value:$row_acc, CooperativeMatrixElementType:$row_acc_elem_type, AttrI1:$is_signed);
   let results = (outs value:$result);
 
@@ -637,7 +672,7 @@ def CooperativeRowAccSumAccumulateOp : LgcOp<"cooperative.rowacc.sum.accumulate"
   }];
 }
 
-def CooperativeRowAccScalarOp : LgcOp<"cooperative.rowacc.scalar", [Memory<[]>, WillReturn]> {
+def CooperativeRowAccScalarOp : DivergentLgcOp<"cooperative.rowacc.scalar", [Memory<[]>, WillReturn]> {
   let arguments = (ins CooperativeMatrixArithOp:$binop, value:$row_acc, CooperativeMatrixElementType:$elem_type, value:$scalar, AttrI1:$accumulate_mode);
   let results = (outs (eq $row_acc):$result);
 
@@ -653,7 +688,7 @@ def CooperativeRowAccScalarOp : LgcOp<"cooperative.rowacc.scalar", [Memory<[]>,
   }];
 }
 
-def CooperativeRowAccExpandOp : LgcOp<"cooperative.rowacc.expand", [Memory<[]>, WillReturn]> {
+def CooperativeRowAccExpandOp : DivergentLgcOp<"cooperative.rowacc.expand", [Memory<[]>, WillReturn]> {
   let arguments = (ins value:$row_acc, CooperativeMatrixElementType:$row_acc_elem_type, CooperativeMatrixElementType:$matrix_elem_type, CooperativeMatrixLayout:$matrix_layout, AttrI1:$col_major);
   let results = (outs value:$result);
 
@@ -707,7 +742,7 @@ def CooperativeMatrixLengthOp : LgcOp<"cooperative.matrix.length", [Memory<[]>,
   }];
 }
 
-def CooperativeMatrixLoadOp : LgcOp<"cooperative.matrix.load", [Memory<[(read)]>, Convergent, WillReturn]> {
+def CooperativeMatrixLoadOp : DivergentLgcOp<"cooperative.matrix.load", [Memory<[(read)]>, Convergent, WillReturn]> {
   let arguments = (ins value:$pointer, value:$stride, AttrI1:$col_major, CooperativeMatrixElementType:$elem_type,
                    CooperativeMatrixLayout:$layout, AttrI32:$memory_access, AttrI32:$alignment);
   let results = (outs value:$result);
@@ -762,7 +797,7 @@ def CooperativeMatrixStoreOp : LgcOp<"cooperative.matrix.store", [Memory<[(write
   }];
 }
 
-def CooperativeMatrixFillOp : LgcOp<"cooperative.matrix.fill", [Memory<[]>, WillReturn]> {
+def CooperativeMatrixFillOp : DivergentLgcOp<"cooperative.matrix.fill", [Memory<[]>, WillReturn]> {
   let arguments = (ins value:$scalar, CooperativeMatrixElementType:$elem_type, CooperativeMatrixLayout:$layout);
   let results = (outs value:$result);
 
@@ -778,7 +813,7 @@ def CooperativeMatrixFillOp : LgcOp<"cooperative.matrix.fill", [Memory<[]>, Will
   }];
 }
 
-def CooperativeMatrixExtractOp : LgcOp<"cooperative.matrix.extract", [Memory<[]>, WillReturn]> {
+def CooperativeMatrixExtractOp : DivergentLgcOp<"cooperative.matrix.extract", [Memory<[]>, WillReturn]> {
   let arguments = (ins value:$matrix, value:$index, CooperativeMatrixElementType:$elem_type, CooperativeMatrixLayout:$layout);
   let results = (outs value:$result);
 
@@ -795,7 +830,7 @@ def CooperativeMatrixExtractOp : LgcOp<"cooperative.matrix.extract", [Memory<[]>
   }];
 }
 
-def CooperativeMatrixInsertOp : LgcOp<"cooperative.matrix.insert", [Memory<[]>, WillReturn]> {
+def CooperativeMatrixInsertOp : DivergentLgcOp<"cooperative.matrix.insert", [Memory<[]>, WillReturn]> {
   let arguments = (ins value:$matrix, value:$insert_value, value:$index, CooperativeMatrixElementType:$elem_type, CooperativeMatrixLayout:$layout);
   let results = (outs value:$result);
 
@@ -813,7 +848,8 @@ def CooperativeMatrixInsertOp : LgcOp<"cooperative.matrix.insert", [Memory<[]>,
   }];
 }
 
-def CooperativeMatrixConvertOp : LgcOp<"cooperative.matrix.convert", [Memory<[(read)]>, Convergent, WillReturn]> {
+def CooperativeMatrixConvertOp : DivergentLgcOp<"cooperative.matrix.convert", [Memory<[(read)]>, Convergent,
+    WillReturn]> {
   let arguments = (ins AttrI32:$cast_op, value:$source, CooperativeMatrixElementType:$src_elem_type, CooperativeMatrixElementType:$dst_elem_type,
                    CooperativeMatrixLayout:$src_layout, CooperativeMatrixLayout:$dst_layout);
   let results = (outs value:$result);
@@ -833,7 +869,7 @@ def CooperativeMatrixConvertOp : LgcOp<"cooperative.matrix.convert", [Memory<[(r
   }];
 }
 
-def CooperativeMatrixTransposeOp : LgcOp<"cooperative.matrix.transpose", [Convergent, WillReturn]> {
+def CooperativeMatrixTransposeOp : DivergentLgcOp<"cooperative.matrix.transpose", [Convergent, WillReturn]> {
   let arguments = (ins value:$matrix, CooperativeMatrixElementType:$elem_type, CooperativeMatrixLayout:$layout);
   let results = (outs value:$result);
 
@@ -849,7 +885,7 @@ def CooperativeMatrixTransposeOp : LgcOp<"cooperative.matrix.transpose", [Conver
   }];
 }
 
-def CooperativeMatrixBinaryOp : LgcOp<"cooperative.matrix.binary", [Convergent, WillReturn]> {
+def CooperativeMatrixBinaryOp : DivergentLgcOp<"cooperative.matrix.binary", [Convergent, WillReturn]> {
   let arguments = (ins CooperativeMatrixArithOp:$arith_op, value:$lhs, value:$rhs, CooperativeMatrixElementType:$elem_type,
                    CooperativeMatrixLayout:$layout);
   let results = (outs value:$result);
@@ -869,7 +905,7 @@ def CooperativeMatrixBinaryOp : LgcOp<"cooperative.matrix.binary", [Convergent,
   }];
 }
 
-def CooperativeMatrixTimesScalarOp : LgcOp<"cooperative.matrix.times.scalar", [Convergent, WillReturn]> {
+def CooperativeMatrixTimesScalarOp : DivergentLgcOp<"cooperative.matrix.times.scalar", [Convergent, WillReturn]> {
   let arguments = (ins value:$matrix, value:$scalar, CooperativeMatrixElementType:$elem_type,
                    CooperativeMatrixLayout:$layout);
   let results = (outs value:$result);
@@ -887,7 +923,7 @@ def CooperativeMatrixTimesScalarOp : LgcOp<"cooperative.matrix.times.scalar", [C
   }];
 }
 
-def CooperativeMatrixMulAddOp : LgcOp<"cooperative.matrix.muladd", [Convergent, WillReturn]> {
+def CooperativeMatrixMulAddOp : DivergentLgcOp<"cooperative.matrix.muladd", [Convergent, WillReturn]> {
   let arguments = (ins value:$matrix_a, value:$matrix_b, value:$matrix_c, AttrI1:$is_signed_a, AttrI1:$is_signed_b,
                    AttrI1:$is_sat_or_opsel, AttrI1:$is_tied, CooperativeMatrixElementType:$matrix_a_elem_type,
                    CooperativeMatrixElementType:$matrix_b_elem_type, CooperativeMatrixElementType:$matrix_c_elem_type);
@@ -919,7 +955,7 @@ def CooperativeMatrixMulAddOp : LgcOp<"cooperative.matrix.muladd", [Convergent,
   }];
 }
 
-def CooperativeMatrixPackOp : LgcOp<"cooperative.matrix.pack", [Memory<[(read)]>, WillReturn]> {
+def CooperativeMatrixPackOp : DivergentLgcOp<"cooperative.matrix.pack", [Memory<[(read)]>, WillReturn]> {
   let arguments = (ins value:$matrix_c_lo, value:$matrix_c_hi);
   let results = (outs value:$result);
 
@@ -934,7 +970,7 @@ def CooperativeMatrixPackOp : LgcOp<"cooperative.matrix.pack", [Memory<[(read)]>
   }];
 }
 
-def CooperativeMatrixUnPackOp : LgcOp<"cooperative.matrix.unpack", [Memory<[(read)]>, WillReturn]> {
+def CooperativeMatrixUnPackOp : DivergentLgcOp<"cooperative.matrix.unpack", [Memory<[(read)]>, WillReturn]> {
   let arguments = (ins value:$packed_matrix, AttrI1:$get_upper_half);
   let results = (outs value:$result);
 
@@ -952,7 +988,7 @@ def CooperativeMatrixUnPackOp : LgcOp<"cooperative.matrix.unpack", [Memory<[(rea
   }];
 }
 
-def SparsityIndexLoadOp : LgcOp<"sparsityindex.load", [Memory<[(read)]>, Convergent, WillReturn]> {
+def SparsityIndexLoadOp : DivergentLgcOp<"sparsityindex.load", [Memory<[(read)]>, Convergent, WillReturn]> {
   let arguments = (ins value:$pointer, value:$stride, AttrI1:$col_major, AttrI32:$memory_access);
   let results = (outs value:$result);
 
@@ -974,7 +1010,7 @@ def SparsityIndexLoadOp : LgcOp<"sparsityindex.load", [Memory<[(read)]>, Converg
   }];
 }
 
-def SparseCooperativeMatrixMulAddOp : LgcOp<"sparseCooperativeMatrix.muladd", [Convergent, WillReturn]> {
+def SparseCooperativeMatrixMulAddOp : DivergentLgcOp<"sparseCooperativeMatrix.muladd", [Convergent, WillReturn]> {
   let arguments = (ins value:$matrix_a, value:$sparse_index, value:$matrix_b, value:$matrix_c, AttrI1:$is_signed_a, AttrI1:$is_signed_b,
                    AttrI1:$is_sat, CooperativeMatrixElementType:$accu_elem_type,
                    CooperativeMatrixElementType:$factor_elem_type);
diff --git a/lgc/interface/lgc/Pipeline.h b/lgc/interface/lgc/Pipeline.h
index bca1917eae..9a2cce48f5 100644
--- a/lgc/interface/lgc/Pipeline.h
+++ b/lgc/interface/lgc/Pipeline.h
@@ -197,11 +197,14 @@ union Options {
     bool disableSampleCoverageAdjust;              // Disable the adjustment of sample coverage
     bool forceFragColorDummyExport;                // Force dummy export is added to fragment shader color export.
     unsigned reserved22;
-    bool dynamicTopology; // Whether primitive topology is dynamic.
+    bool dynamicTopology;    // Whether primitive topology is dynamic.
+    bool robustBufferAccess; // Enable the core robust buffer access
     bool reserved23;
-    bool forceUserDataSpill;    // Whether to force all user data to be spilled (Currently only for RT).
-    bool enableMapClipDistMask; // For OGL only, whether to remap the clip distances.
-    unsigned clipPlaneMask;     // For OGL only, defines the bitmask for enabling/disabling clip planes.
+    bool forceUserDataSpill;     // Whether to force all user data to be spilled (Currently only for RT).
+    bool optimizePointSizeWrite; // Optimize the write of PointSize in the last vertex processing stage by
+                                 // eliminating it if the write value is 1.0.
+    bool enableMapClipDistMask;  // For OGL only, whether to remap the clip distances.
+    unsigned clipPlaneMask;      // For OGL only, defines the bitmask for enabling/disabling clip planes.
   };
 };
 static_assert(sizeof(Options) == sizeof(Options::u32All));
@@ -710,6 +713,9 @@ struct ComputeShaderMode {
   unsigned subgroupSize;               // Override for the wave size if it is non-zero
   DerivativeMode derivativeMode;       // derivativeMode for computeShader
   unsigned noLocalInvocationIdInCalls; // For compute with calls, assume local invocation ID is never used in callees
+  unsigned origWorkgroupSizeX;         // X dimension of original workgroup size. 0 means no original size
+  unsigned origWorkgroupSizeY;         // Y dimension of original workgroup size. If X is non-zero, Y must be non-zero
+  unsigned origWorkgroupSizeZ;         // Z dimension of original workgroup size. If X is non-zero, Z must be non-zero
 };
 
 // Enum passed to Pipeline::irLink to give information on whether this is a whole or part pipeline.
diff --git a/lgc/interface/lgc/RegStackUsage.h b/lgc/interface/lgc/RegStackUsage.h
new file mode 100644
index 0000000000..421ce14038
--- /dev/null
+++ b/lgc/interface/lgc/RegStackUsage.h
@@ -0,0 +1,82 @@
+/*
+ ***********************************************************************************************************************
+ *
+ *  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal in the Software without restriction, including without limitation the
+ *  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ *  sell copies of the Software, and to permit persons to whom the Software is
+ *  furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice shall be included in all
+ *  copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ *
+ **********************************************************************************************************************/
+
+// Extracting, merging and inserting reg/stack usage in PAL metadata across different ELFs.
+// A front-end can use this to propagate register and stack usage from library ELFs up to a compute
+// shader ELF.
+
+#pragma once
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+class Module;
+} // namespace llvm
+
+namespace lgc {
+
+class RegStackUsageImpl;
+
+// Class to parse reg/stack usage from PAL metadata and merge it back.
+class RegStackUsage {
+public:
+  ~RegStackUsage();
+
+  // Construct empty, ready to use merge() to accumulate reg/stack usage in "this".
+  RegStackUsage();
+
+  // Construct from ELF blob. This reads the reg/stack usage from the ELF's PAL metadata.
+  //
+  // @param elfBlob : The ELF blob; must remain valid for the lifetime of the RegStackUsage object
+  // @param maxTraceRayDepth : Max traceRay recursion depth for this shader as specified by the app; 0 for traversal
+  // @param rayGenUsage : Bitmap of which rayGens can reach this shader, with bit 63 covering all rayGens
+  //                      beyond the first 63; 0 for traversal
+  RegStackUsage(llvm::StringRef elfBlob, unsigned maxTraceRayDepth, uint64_t rayGenUsage);
+
+  // Construct from Module. This reads the reg/stack usage from IR metadata, as written by writeMetadata().
+  RegStackUsage(const llvm::Module &module);
+
+  // Write the reg/stack usage into IR metadata.
+  void writeMetadata(llvm::Module &module) const;
+
+  // Merge reg/stack usage from one shader ELF into the accumulated merged usage in "this".
+  void merge(const RegStackUsage &shaderUsage);
+
+  // Finalize merged usage in "this" (that comes from indirect shaders), merge into the supplied ELF's usage,
+  // and update the PAL metadata in the ELF.
+  //
+  // @param [in/out] elfBuffer : Buffer containing ELF to read and update
+  // @param startOffset : Start offset of the ELF in the buffer
+  // @param frontendGlobalAlignment : Alignment of frontend stack for global CPS; 0 for scratch CPS
+  //
+  void finalizeAndUpdate(llvm::SmallVectorImpl<char> &elfBuffer, size_t startOffset, unsigned frontendGlobalAlignment);
+
+private:
+  std::unique_ptr<RegStackUsageImpl> m_impl; // Implementation object; its class is only forward-declared here
+};
+
+} // namespace lgc
diff --git a/lgc/patch/AddLoopMetadata.cpp b/lgc/patch/AddLoopMetadata.cpp
index d8d79d6b0c..6666075f45 100644
--- a/lgc/patch/AddLoopMetadata.cpp
+++ b/lgc/patch/AddLoopMetadata.cpp
@@ -25,7 +25,7 @@
 /**
  ***********************************************************************************************************************
  * @file  AddLoopMetadata.cpp
- * @brief LLPC source file: contains implementation of class lgc::PatchLoopMetadata.
+ * @brief LLPC source file: contains implementation of class lgc::AddLoopMetadata.
  ***********************************************************************************************************************
  */
 #include "lgc/patch/AddLoopMetadata.h"
@@ -35,7 +35,7 @@
 #include "llvm/Support/Debug.h"
 #include <vector>
 
-#define DEBUG_TYPE "lgc-patch-loop-metadata"
+#define DEBUG_TYPE "lgc-add-loop-metadata"
 
 using namespace llvm;
 using namespace lgc;
@@ -54,8 +54,8 @@ typedef OuterAnalysisManagerProxy<ModuleAnalysisManager, Loop, LoopStandardAnaly
 // @param prefixesToRemove : metadata prefixes to be removed
 // @param newMetadata : the new metadata to be added
 // @param conditional : true if the new metadata is only to be added if one or more prefixes was removed
-MDNode *PatchLoopMetadata::updateMetadata(MDNode *loopId, ArrayRef<StringRef> prefixesToRemove, Metadata *newMetadata,
-                                          bool conditional) {
+MDNode *AddLoopMetadata::updateMetadata(MDNode *loopId, ArrayRef<StringRef> prefixesToRemove, Metadata *newMetadata,
+                                        bool conditional) {
   bool found = false;
   SmallVector<Metadata *, 4> mds;
   // Reserve first location for self reference to the loopId metadata node.
@@ -84,7 +84,7 @@ MDNode *PatchLoopMetadata::updateMetadata(MDNode *loopId, ArrayRef<StringRef> pr
 };
 
 // =====================================================================================================================
-PatchLoopMetadata::PatchLoopMetadata()
+AddLoopMetadata::AddLoopMetadata()
     : m_context(nullptr), m_forceLoopUnrollCount(0), m_disableLoopUnroll(false), m_disableLicmThreshold(0),
       m_unrollHintThreshold(0), m_dontUnrollHintThreshold(0) {
 }
@@ -96,13 +96,13 @@ PatchLoopMetadata::PatchLoopMetadata()
 // @param [in/out] analysisManager : Analysis manager to use for this transformation
 // @param [in/out] loopAnalysisResult : Loop standard analysis results
 // @returns : The preserved analyses (The analyses that are still valid after this pass)
-PreservedAnalyses PatchLoopMetadata::run(Loop &loop, LoopAnalysisManager &analysisManager,
-                                         LoopStandardAnalysisResults &loopAnalysisResults, LPMUpdater &) {
+PreservedAnalyses AddLoopMetadata::run(Loop &loop, LoopAnalysisManager &analysisManager,
+                                       LoopStandardAnalysisResults &loopAnalysisResults, LPMUpdater &) {
   Module *module = loop.getHeader()->getModule();
   const auto &mamProxy = analysisManager.getResult<ModuleAnalysisManagerLoopProxy>(loop, loopAnalysisResults);
   PipelineState *pipelineState = mamProxy.getCachedResult<PipelineStateWrapper>(*module)->getPipelineState();
 
-  LLVM_DEBUG(dbgs() << "Run the pass lgc-patch-loop-metadata\n");
+  LLVM_DEBUG(dbgs() << "Run the pass Add-Loop-Metadata\n");
 
   Function *func = loop.getHeader()->getFirstNonPHI()->getFunction();
   PipelineState *mPipelineState = pipelineState;
diff --git a/lgc/patch/PatchWorkarounds.cpp b/lgc/patch/ApplyWorkarounds.cpp
similarity index 99%
rename from lgc/patch/PatchWorkarounds.cpp
rename to lgc/patch/ApplyWorkarounds.cpp
index 6c9b1fa566..ad7213f703 100644
--- a/lgc/patch/PatchWorkarounds.cpp
+++ b/lgc/patch/ApplyWorkarounds.cpp
@@ -24,12 +24,12 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  PatchWorkarounds.cpp
+ * @file  ApplyWorkarounds.cpp
  * @brief LLPC source file: contains implementation of class lgc::PatchWorkarounds.
  ***********************************************************************************************************************
  */
 
-#include "lgc/patch/PatchWorkarounds.h"
+#include "lgc/patch/ApplyWorkarounds.h"
 #include "lgc/state/PipelineShaders.h"
 #include "lgc/state/PipelineState.h"
 #include "lgc/state/TargetInfo.h"
diff --git a/lgc/patch/CollectImageOperations.cpp b/lgc/patch/CollectImageOperations.cpp
index 069c3fe5c2..edc0717e71 100644
--- a/lgc/patch/CollectImageOperations.cpp
+++ b/lgc/patch/CollectImageOperations.cpp
@@ -25,7 +25,7 @@
 /**
  ***********************************************************************************************************************
  * @file  CollectImageOperations.cpp
- * @brief LLPC source file: contains implementation of class lgc::PatchImageOpCollect.
+ * @brief LLPC source file: contains implementation of class lgc::CollectImageOperations.
  ***********************************************************************************************************************
  */
 #include "lgc/patch/CollectImageOperations.h"
@@ -34,7 +34,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/Debug.h"
 
-#define DEBUG_TYPE "lgc-patch-image-op-collect"
+#define DEBUG_TYPE "lgc-collect-image-operations"
 
 using namespace llvm;
 using namespace lgc;
@@ -47,10 +47,10 @@ namespace lgc {
 // @param [in/out] module : LLVM module to be run on
 // @param [in/out] analysisManager : Analysis manager to use for this transformation
 // @returns : The preserved analyses (The analyses that are still valid after this pass)
-PreservedAnalyses PatchImageOpCollect::run(Module &module, ModuleAnalysisManager &analysisManager) {
+PreservedAnalyses CollectImageOperations::run(Module &module, ModuleAnalysisManager &analysisManager) {
   PipelineState *pipelineState = analysisManager.getResult<PipelineStateWrapper>(module).getPipelineState();
 
-  LLVM_DEBUG(dbgs() << "Run the pass Patch-Image-Op-Collect\n");
+  LLVM_DEBUG(dbgs() << "Run the pass Collect-Image-Operations\n");
 
   for (Function &func : module) {
     if (!func.isIntrinsic())
diff --git a/lgc/patch/PatchResourceCollect.cpp b/lgc/patch/CollectResourceUsage.cpp
similarity index 97%
rename from lgc/patch/PatchResourceCollect.cpp
rename to lgc/patch/CollectResourceUsage.cpp
index 85d8ec0f06..40dec85d89 100644
--- a/lgc/patch/PatchResourceCollect.cpp
+++ b/lgc/patch/CollectResourceUsage.cpp
@@ -24,11 +24,11 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  PatchResourceCollect.cpp
+ * @file  CollectResourceUsage.cpp
  * @brief LLPC source file: contains implementation of class lgc::PatchResourceCollect.
  ***********************************************************************************************************************
  */
-#include "lgc/patch/PatchResourceCollect.h"
+#include "lgc/patch/CollectResourceUsage.h"
 #include "MeshTaskShader.h"
 #include "NggPrimShader.h"
 #include "lgc/Builder.h"
@@ -343,7 +343,8 @@ bool PatchResourceCollect::canUseNggCulling(Module *module) {
       if (m_pipelineState->getOptions().dynamicTopology || m_pipelineState->isUnlinked())
         return false;
       // Check primitive type specified in pipeline state
-      if (primType < PrimitiveType::TriangleList)
+      if (primType == PrimitiveType::Point || primType == PrimitiveType::LineList ||
+          primType == PrimitiveType::LineStrip || primType == PrimitiveType::Rect)
         return false;
     }
   }
@@ -490,6 +491,7 @@ bool PatchResourceCollect::checkGsOnChipValidity() {
     case PrimitiveType::TriangleFan:
     case PrimitiveType::TriangleListAdjacency:
     case PrimitiveType::TriangleStripAdjacency:
+    case PrimitiveType::Rect:
       inVertsPerPrim = 3;
       break;
     case PrimitiveType::Patch:
@@ -1275,16 +1277,26 @@ void PatchResourceCollect::visitCallInst(CallInst &callInst) {
   } else if (mangledName.starts_with(lgcName::OutputExportGeneric)) {
     m_outputCalls.push_back(&callInst);
   } else if (mangledName.starts_with(lgcName::OutputExportBuiltIn)) {
-    // NOTE: If an output value is unspecified, we can safely drop it and remove the output export call.
-    // Currently, do this for geometry shader.
-    if (m_shaderStage == ShaderStage::Geometry) {
+    // NOTE: If an output value is unspecified, we can safely drop it and remove the output export call for the last
+    // vertex processing stage (Mesh shader has different processing).
+    if (m_pipelineState->getLastVertexProcessingStage() == m_shaderStage && m_shaderStage != ShaderStage::Mesh) {
+      unsigned builtInId = cast<ConstantInt>(callInst.getOperand(0))->getZExtValue();
       auto outputValue = callInst.getArgOperand(callInst.arg_size() - 1);
+      bool builtInActive = true;
+
       if (isa<UndefValue>(outputValue) || isa<PoisonValue>(outputValue))
-        m_deadCalls.push_back(&callInst);
-      else {
-        unsigned builtInId = cast<ConstantInt>(callInst.getOperand(0))->getZExtValue();
-        m_activeOutputBuiltIns.insert(builtInId);
+        builtInActive = false;
+
+      if (m_pipelineState->getOptions().optimizePointSizeWrite && builtInId == BuiltInPointSize) {
+        // Remove the write of PointSize if its write value is 1.0.
+        if (isa<ConstantFP>(outputValue) && cast<ConstantFP>(outputValue)->getValueAPF().convertToFloat() == 1.0)
+          builtInActive = false;
       }
+
+      if (builtInActive)
+        m_activeOutputBuiltIns.insert(builtInId);
+      else
+        m_deadCalls.push_back(&callInst);
     }
   } else if (mangledName.starts_with(lgcName::OutputExportXfb)) {
     auto outputValue = callInst.getArgOperand(callInst.arg_size() - 1);
@@ -1536,34 +1548,107 @@ void PatchResourceCollect::clearInactiveBuiltInInput() {
 // =====================================================================================================================
 // Clears inactive (those actually unused) outputs.
 void PatchResourceCollect::clearInactiveBuiltInOutput() {
-  // Clear inactive output builtins
-  if (m_shaderStage == ShaderStage::Geometry) {
-    auto &builtInUsage = m_resUsage->builtInUsage.gs;
+  // Clear inactive output built-ins for the last vertex processing stage (Mesh shader has different processing).
+  if (m_pipelineState->getLastVertexProcessingStage() == m_shaderStage && m_shaderStage != ShaderStage::Mesh) {
+    auto &builtInOutLocMap = m_resUsage->inOutUsage.builtInOutputLocMap;
+
+    if (m_shaderStage == ShaderStage::Geometry) {
+      auto &builtInUsage = m_resUsage->builtInUsage.gs;
 
-    if (builtInUsage.position && m_activeOutputBuiltIns.find(BuiltInPosition) == m_activeOutputBuiltIns.end())
-      builtInUsage.position = false;
+      if (builtInUsage.position && m_activeOutputBuiltIns.count(BuiltInPosition) == 0)
+        builtInUsage.position = false;
 
-    if (builtInUsage.pointSize && m_activeOutputBuiltIns.find(BuiltInPointSize) == m_activeOutputBuiltIns.end())
-      builtInUsage.pointSize = false;
+      if (builtInUsage.pointSize && m_activeOutputBuiltIns.count(BuiltInPointSize) == 0)
+        builtInUsage.pointSize = false;
 
-    if (builtInUsage.clipDistance && m_activeOutputBuiltIns.find(BuiltInClipDistance) == m_activeOutputBuiltIns.end())
-      builtInUsage.clipDistance = false;
+      if (builtInUsage.clipDistance > 0 && m_activeOutputBuiltIns.count(BuiltInClipDistance) == 0) {
+        builtInUsage.clipDistance = 0;
+        builtInOutLocMap.erase(BuiltInClipDistance);
+      }
 
-    if (builtInUsage.cullDistance && m_activeOutputBuiltIns.find(BuiltInCullDistance) == m_activeOutputBuiltIns.end())
-      builtInUsage.cullDistance = false;
+      if (builtInUsage.cullDistance > 0 && m_activeOutputBuiltIns.count(BuiltInCullDistance) == 0) {
+        builtInUsage.cullDistance = 0;
+        builtInOutLocMap.erase(BuiltInCullDistance);
+      }
 
-    if (builtInUsage.primitiveId && m_activeOutputBuiltIns.find(BuiltInPrimitiveId) == m_activeOutputBuiltIns.end())
-      builtInUsage.primitiveId = false;
+      if (builtInUsage.primitiveId && m_activeOutputBuiltIns.count(BuiltInPrimitiveId) == 0) {
+        builtInUsage.primitiveId = false;
+        builtInOutLocMap.erase(BuiltInPrimitiveId);
+      }
 
-    if (builtInUsage.layer && m_activeOutputBuiltIns.find(BuiltInLayer) == m_activeOutputBuiltIns.end())
-      builtInUsage.layer = false;
+      if (builtInUsage.layer && m_activeOutputBuiltIns.count(BuiltInLayer) == 0) {
+        builtInUsage.layer = false;
+        builtInOutLocMap.erase(BuiltInLayer);
+      }
 
-    if (builtInUsage.viewportIndex && m_activeOutputBuiltIns.find(BuiltInViewportIndex) == m_activeOutputBuiltIns.end())
-      builtInUsage.viewportIndex = false;
+      if (builtInUsage.viewportIndex && m_activeOutputBuiltIns.count(BuiltInViewportIndex) == 0) {
+        builtInUsage.viewportIndex = false;
+        builtInOutLocMap.erase(BuiltInViewportIndex);
+      }
+
+      if (builtInUsage.primitiveShadingRate && m_activeOutputBuiltIns.count(BuiltInPrimitiveShadingRate) == 0)
+        builtInUsage.primitiveShadingRate = false;
+    } else if (m_shaderStage == ShaderStage::TessEval) {
+      auto &builtInUsage = m_resUsage->builtInUsage.tes;
+
+      if (builtInUsage.position && m_activeOutputBuiltIns.count(BuiltInPosition) == 0)
+        builtInUsage.position = false;
+
+      if (builtInUsage.pointSize && m_activeOutputBuiltIns.count(BuiltInPointSize) == 0)
+        builtInUsage.pointSize = false;
+
+      if (builtInUsage.clipDistance > 0 && m_activeOutputBuiltIns.count(BuiltInClipDistance) == 0) {
+        builtInUsage.clipDistance = 0;
+        builtInOutLocMap.erase(BuiltInClipDistance);
+      }
 
-    if (builtInUsage.primitiveShadingRate &&
-        m_activeOutputBuiltIns.find(BuiltInPrimitiveShadingRate) == m_activeOutputBuiltIns.end())
-      builtInUsage.primitiveShadingRate = false;
+      if (builtInUsage.cullDistance > 0 && m_activeOutputBuiltIns.count(BuiltInCullDistance) == 0) {
+        builtInUsage.cullDistance = 0;
+        builtInOutLocMap.erase(BuiltInCullDistance);
+      }
+
+      if (builtInUsage.layer && m_activeOutputBuiltIns.count(BuiltInLayer) == 0) {
+        builtInUsage.layer = false;
+        builtInOutLocMap.erase(BuiltInLayer);
+      }
+
+      if (builtInUsage.viewportIndex && m_activeOutputBuiltIns.count(BuiltInViewportIndex) == 0) {
+        builtInUsage.viewportIndex = false;
+        builtInOutLocMap.erase(BuiltInViewportIndex);
+      }
+    } else {
+      assert(m_shaderStage == ShaderStage::Vertex);
+      auto &builtInUsage = m_resUsage->builtInUsage.vs;
+
+      if (builtInUsage.position && m_activeOutputBuiltIns.count(BuiltInPosition) == 0)
+        builtInUsage.position = false;
+
+      if (builtInUsage.pointSize && m_activeOutputBuiltIns.count(BuiltInPointSize) == 0)
+        builtInUsage.pointSize = false;
+
+      if (builtInUsage.clipDistance > 0 && m_activeOutputBuiltIns.count(BuiltInClipDistance) == 0) {
+        builtInUsage.clipDistance = 0;
+        builtInOutLocMap.erase(BuiltInClipDistance);
+      }
+
+      if (builtInUsage.cullDistance > 0 && m_activeOutputBuiltIns.count(BuiltInCullDistance) == 0) {
+        builtInUsage.cullDistance = 0;
+        builtInOutLocMap.erase(BuiltInCullDistance);
+      }
+
+      if (builtInUsage.layer && m_activeOutputBuiltIns.count(BuiltInLayer) == 0) {
+        builtInUsage.layer = false;
+        builtInOutLocMap.erase(BuiltInLayer);
+      }
+
+      if (builtInUsage.viewportIndex && m_activeOutputBuiltIns.count(BuiltInViewportIndex) == 0) {
+        builtInUsage.viewportIndex = false;
+        builtInOutLocMap.erase(BuiltInViewportIndex);
+      }
+
+      if (builtInUsage.primitiveShadingRate && m_activeOutputBuiltIns.count(BuiltInPrimitiveShadingRate) == 0)
+        builtInUsage.primitiveShadingRate = false;
+    }
   }
 }
 
@@ -3770,8 +3855,7 @@ void PatchResourceCollect::clearUndefinedOutput() {
   // NOTE: If a vector or all used channels in a location are not specified, we can safely drop it and remove the output
   // export call
   struct CandidateInfo {
-    unsigned undefMask = 0;
-    unsigned usedMask = 0;
+    bool isLocKept = false;
     SmallVector<CallInst *> candidateCalls;
   };
   // Collect candidate info with undefined value at a location.
@@ -3781,34 +3865,24 @@ void PatchResourceCollect::clearUndefinedOutput() {
     auto outputValue = call->getArgOperand(call->arg_size() - 1);
     bool isUndefVal = isa<UndefValue>(outputValue) || isa<PoisonValue>(outputValue);
     unsigned index = (m_shaderStage == ShaderStage::Mesh || m_shaderStage == ShaderStage::TessControl) ? 2 : 1;
-    bool isDynElemIndexing = !isa<ConstantInt>(call->getArgOperand(index));
+    if (!isa<ConstantInt>(call->getArgOperand(index)))
+      isUndefVal = false; // keep the call
 
     InOutLocationInfo locInfo;
     locInfo.setLocation(cast<ConstantInt>(call->getArgOperand(0))->getZExtValue());
     if (m_shaderStage == ShaderStage::Geometry)
       locInfo.setStreamId(cast<ConstantInt>(call->getArgOperand(2))->getZExtValue());
 
-    unsigned undefMask = 0;
-    unsigned usedMask = 0;
-    if (isDynElemIndexing)
-      usedMask = 1; // keep the call
-    else {
-      const unsigned elemIdx = cast<ConstantInt>(call->getArgOperand(index))->getZExtValue();
-      usedMask = 1 << elemIdx;
-      if (isUndefVal)
-        undefMask = 1 << elemIdx;
-    }
-
     auto iter = locCandidateInfoMap.find(locInfo);
     if (iter == locCandidateInfoMap.end()) {
       CandidateInfo candidataInfo;
-      candidataInfo.undefMask = undefMask;
-      candidataInfo.usedMask = usedMask;
+      candidataInfo.isLocKept = !isUndefVal;
       candidataInfo.candidateCalls.push_back(call);
       locCandidateInfoMap[locInfo] = candidataInfo;
     } else {
-      iter->second.undefMask |= undefMask;
-      iter->second.usedMask |= usedMask;
+      // Keep the location if we ever output a non-undef value to it.
+      if (!isUndefVal)
+        iter->second.isLocKept = true;
       iter->second.candidateCalls.push_back(call);
     }
   }
@@ -3817,7 +3891,7 @@ void PatchResourceCollect::clearUndefinedOutput() {
   // Check if all used channels are undefined in a location in a stream
   for (auto &locCandidate : locCandidateInfoMap) {
     auto candidateCalls = locCandidate.second.candidateCalls;
-    if (locCandidate.second.usedMask != locCandidate.second.undefMask) {
+    if (locCandidate.second.isLocKept) {
       m_outputCalls.insert(m_outputCalls.end(), candidateCalls.begin(), candidateCalls.end());
       for (auto call : candidateCalls) {
         assert(call->arg_size());
diff --git a/lgc/patch/ConfigBuilderBase.cpp b/lgc/patch/ConfigBuilderBase.cpp
index 0823ef161f..08c6b838e2 100644
--- a/lgc/patch/ConfigBuilderBase.cpp
+++ b/lgc/patch/ConfigBuilderBase.cpp
@@ -310,6 +310,17 @@ void ConfigBuilderBase::setThreadgroupDimensions(llvm::ArrayRef<unsigned> values
     arrayNode[i] = values[i];
 }
 
+// =====================================================================================================================
+// Set original thread group dimensions in the OrigThreadgroupDimensions array of the CS hardware stage metadata
+//
+// @param values : Dimension values to set; element i is written to array entry i
+void ConfigBuilderBase::setOrigThreadgroupDimensions(llvm::ArrayRef<unsigned> values) {
+  auto hwShaderNode = getHwShaderNode(Util::Abi::HardwareStage::Cs);
+  auto &arrayNode = hwShaderNode[Util::Abi::HardwareStageMetadataKey::OrigThreadgroupDimensions].getArray(true);
+  for (unsigned i = 0; i < values.size(); ++i)
+    arrayNode[i] = values[i];
+}
+
 // =====================================================================================================================
 // Set stream-out vertex strides
 //
diff --git a/lgc/patch/ConfigBuilderBase.h b/lgc/patch/ConfigBuilderBase.h
index 873a036270..d34e7ff674 100644
--- a/lgc/patch/ConfigBuilderBase.h
+++ b/lgc/patch/ConfigBuilderBase.h
@@ -85,6 +85,7 @@ class ConfigBuilderBase {
   void setLdsSizeByteSize(Util::Abi::HardwareStage hwStage, unsigned value);
   void setNggSubgroupSize(unsigned value);
   void setThreadgroupDimensions(llvm::ArrayRef<unsigned> values);
+  void setOrigThreadgroupDimensions(llvm::ArrayRef<unsigned> values);
   void setStreamOutVertexStrides(llvm::ArrayRef<unsigned> values);
   unsigned setupFloatingPointMode(ShaderStageEnum shaderStage);
 
diff --git a/lgc/patch/Continufy.cpp b/lgc/patch/Continufy.cpp
index 795cac042c..fcf7ba23e9 100644
--- a/lgc/patch/Continufy.cpp
+++ b/lgc/patch/Continufy.cpp
@@ -32,6 +32,7 @@
 
 #include "lgc/patch/Continufy.h"
 #include "compilerutils/CompilerUtils.h"
+#include "llvmraytracing/ContinuationsUtil.h"
 #include "lgc/Builder.h"
 #include "lgc/LgcCpsDialect.h"
 #include "lgc/LgcDialect.h"
@@ -174,6 +175,7 @@ PreservedAnalyses Continufy::run(Module &module, ModuleAnalysisManager &analysis
         SmallVector<Value *> tailArgs = {PoisonValue::get(builder.getInt32Ty())};
         tailArgs.append(call.arg_begin(), call.arg_end());
         auto *newCall = builder.create<AwaitOp>(call.getType(), continuationRef, 1u << (unsigned)calleeLevel, tailArgs);
+        ContHelper::ReturnedRegisterCount::setValue(newCall, 0);
         call.replaceAllUsesWith(newCall);
         tobeErased.push_back(&call);
       }
@@ -195,7 +197,8 @@ PreservedAnalyses Continufy::run(Module &module, ModuleAnalysisManager &analysis
             tailArgs.push_back(retValue);
 
           builder.create<JumpOp>(fnPtr->getArg(1), getReturnedLevels(currentRtStage.value()),
-                                 PoisonValue::get(StructType::get(context, {})) /* state */, poisonI32, tailArgs);
+                                 PoisonValue::get(StructType::get(context, {})) /* state */, poisonI32 /* csp */,
+                                 poisonI32 /* rcr */, tailArgs);
         }
 
         builder.CreateUnreachable();
diff --git a/lgc/patch/FragColorExport.cpp b/lgc/patch/FragmentColorExport.cpp
similarity index 99%
rename from lgc/patch/FragColorExport.cpp
rename to lgc/patch/FragmentColorExport.cpp
index 52a5797573..27e17693d1 100644
--- a/lgc/patch/FragColorExport.cpp
+++ b/lgc/patch/FragmentColorExport.cpp
@@ -24,11 +24,11 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  FragColorExport.cpp
+ * @file  FragmentColorExport.cpp
  * @brief LLPC source file: contains implementation of class lgc::FragColorExport.
  ***********************************************************************************************************************
  */
-#include "lgc/patch/FragColorExport.h"
+#include "lgc/patch/FragmentColorExport.h"
 #include "lgc/LgcContext.h"
 #include "lgc/patch/Patch.h"
 #include "lgc/patch/ShaderInputs.h"
diff --git a/lgc/patch/GenerateCopyShader.cpp b/lgc/patch/GenerateCopyShader.cpp
index ad4c50a5a1..b962b0a744 100644
--- a/lgc/patch/GenerateCopyShader.cpp
+++ b/lgc/patch/GenerateCopyShader.cpp
@@ -112,30 +112,18 @@ PreservedAnalyses GenerateCopyShader::run(Module &module, ModuleAnalysisManager
     // the argument definitions are decided by compiler not by HW. We could have such variable layout (not fixed with
     // GPU generation evolvement):
     //
-    // GFX10:
     //   void copyShader(
     //     i32 vertexIndex)
-    //
-    // GFX11+:
-    //   void copyShader(
-    //     i32 inreg globalTable,
-    //     i32 vertexIndex)
-    if (m_pipelineState->getTargetInfo().getGfxIpVersion().major <= 10) {
-      argTys = {int32Ty};
-      argInReg = {false};
-      argNames = {"vertexIndex"};
-    } else {
-      argTys = {int32Ty, int32Ty};
-      argInReg = {true, false};
-      argNames = {"globalTable", "vertexIndex"};
-    }
+    argTys = {int32Ty};
+    argInReg = {false};
+    argNames = {"vertexIndex"};
   }
 
   auto entryPointTy = FunctionType::get(builder.getVoidTy(), argTys, false);
 
   // Create function for the copy shader entrypoint, and insert it before the FS (if there is one).
   auto entryPoint =
-      createFunctionHelper(entryPointTy, GlobalValue::ExternalLinkage, &module, lgcName::CopyShaderEntryPoint);
+      createFunctionHelper(entryPointTy, GlobalValue::ExternalLinkage, &module, false, lgcName::CopyShaderEntryPoint);
   entryPoint->setDLLStorageClass(GlobalValue::DLLExportStorageClass);
   entryPoint->setCallingConv(CallingConv::AMDGPU_VS);
 
diff --git a/lgc/patch/PatchNullFragShader.cpp b/lgc/patch/GenerateNullFragmentShader.cpp
similarity index 97%
rename from lgc/patch/PatchNullFragShader.cpp
rename to lgc/patch/GenerateNullFragmentShader.cpp
index 7347611f8d..7bf335278e 100644
--- a/lgc/patch/PatchNullFragShader.cpp
+++ b/lgc/patch/GenerateNullFragmentShader.cpp
@@ -24,13 +24,13 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  PatchNullFragShader.cpp
+ * @file  GenerateNullFragmentShader.cpp
  * @brief LLPC source file: contains declaration and implementation of class lgc::PatchNullFragShader.
  ***********************************************************************************************************************
  */
-#include "PatchNullFragShader.h"
+#include "GenerateNullFragmentShader.h"
 #include "lgc/LgcContext.h"
-#include "lgc/patch/FragColorExport.h"
+#include "lgc/patch/FragmentColorExport.h"
 #include "lgc/patch/Patch.h"
 #include "lgc/state/IntrinsDefs.h"
 #include "lgc/state/PalMetadata.h"
diff --git a/lgc/patch/PatchNullFragShader.h b/lgc/patch/GenerateNullFragmentShader.h
similarity index 98%
rename from lgc/patch/PatchNullFragShader.h
rename to lgc/patch/GenerateNullFragmentShader.h
index a9e0f70066..c836dd2cbf 100644
--- a/lgc/patch/PatchNullFragShader.h
+++ b/lgc/patch/GenerateNullFragmentShader.h
@@ -24,7 +24,7 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  PatchNullFragShader.h
+ * @file  GenerateNullFragmentShader.h
  * @brief LLPC header file: contains declaration of class lgc::PatchNullFragShader.
  ***********************************************************************************************************************
  */
diff --git a/lgc/patch/IncludeLlvmIr.cpp b/lgc/patch/IncludeLlvmIr.cpp
index 9553f03706..56196f8bd0 100644
--- a/lgc/patch/IncludeLlvmIr.cpp
+++ b/lgc/patch/IncludeLlvmIr.cpp
@@ -25,7 +25,7 @@
 /**
  ***********************************************************************************************************************
  * @file  IncludeLlvmIr.cpp
- * @brief LLPC source file: contains implementation of class lgc::PatchLlvmIrInclusion.
+ * @brief LLPC source file: contains implementation of class lgc::IncludeLlvmIr.
  ***********************************************************************************************************************
  */
 #include "lgc/patch/IncludeLlvmIr.h"
@@ -33,7 +33,7 @@
 #include "lgc/state/PipelineState.h"
 #include "llvm/IR/Constants.h"
 
-#define DEBUG_TYPE "lgc-patch-llvm-ir-inclusion"
+#define DEBUG_TYPE "lgc-include-llvm-ir"
 
 using namespace llvm;
 using namespace lgc;
@@ -46,7 +46,7 @@ namespace lgc {
 // @param [in/out] module : LLVM module to be run on
 // @param [in/out] analysisManager : Analysis manager to use for this transformation
 // @returns : The preserved analyses (The analyses that are still valid after this pass)
-PreservedAnalyses PatchLlvmIrInclusion::run(Module &module, ModuleAnalysisManager &analysisManager) {
+PreservedAnalyses IncludeLlvmIr::run(Module &module, ModuleAnalysisManager &analysisManager) {
   PipelineState *pipelineState = analysisManager.getResult<PipelineStateWrapper>(module).getPipelineState();
   if (!pipelineState->getOptions().includeIr)
     return PreservedAnalyses::all();
diff --git a/lgc/patch/PatchBufferOp.cpp b/lgc/patch/LowerBufferOperations.cpp
similarity index 92%
rename from lgc/patch/PatchBufferOp.cpp
rename to lgc/patch/LowerBufferOperations.cpp
index d5b7c1dac2..3bdb4a204b 100644
--- a/lgc/patch/PatchBufferOp.cpp
+++ b/lgc/patch/LowerBufferOperations.cpp
@@ -24,12 +24,13 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  PatchBufferOp.cpp
+ * @file  LowerBufferOperations.cpp
  * @brief LLPC source file: contains implementation of class lgc::PatchBufferOp.
  ***********************************************************************************************************************
  */
-#include "lgc/patch/PatchBufferOp.h"
+#include "lgc/patch/LowerBufferOperations.h"
 #include "lgc/Builder.h"
+#include "lgc/CommonDefs.h"
 #include "lgc/LgcContext.h"
 #include "lgc/LgcDialect.h"
 #include "lgc/builder/BuilderImpl.h"
@@ -38,6 +39,7 @@
 #include "lgc/state/TargetInfo.h"
 #include "llvm-dialects/Dialect/Visitor.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Support/AtomicOrdering.h"
 #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 458033
 // Old version of the code
 #include "llvm/Analysis/DivergenceAnalysis.h"
@@ -199,6 +201,7 @@ void BufferOpLowering::registerVisitors(llvm_dialects::VisitorBuilder<BufferOpLo
   builder.add(&BufferOpLowering::visitBitCastInst);
   builder.add(&BufferOpLowering::visitBufferAddrToPtr);
   builder.add(&BufferOpLowering::visitBufferDescToPtr);
+  builder.add(&BufferOpLowering::visitConvertToStridedBufferPointer);
   builder.add(&BufferOpLowering::visitStridedBufferDescToPtr);
   builder.add(&BufferOpLowering::visitBufferLoadDescToPtr);
   builder.add(&BufferOpLowering::visitStridedBufferLoadDescToPtr);
@@ -216,6 +219,7 @@ void BufferOpLowering::registerVisitors(llvm_dialects::VisitorBuilder<BufferOpLo
   builder.add(&BufferOpLowering::visitStoreInst);
   builder.add(&BufferOpLowering::visitICmpInst);
   builder.addIntrinsic(Intrinsic::invariant_start, &BufferOpLowering::visitInvariantStart);
+  builder.addIntrinsic(Intrinsic::amdgcn_readfirstlane, &BufferOpLowering::visitReadFirstLane);
 }
 
 // =====================================================================================================================
@@ -249,7 +253,7 @@ void BufferOpLowering::finish() {
 
   SmallVector<Instruction *> instructions;
   std::swap(instructions, m_postVisitInsts);
-  for (Instruction *inst : instructions)
+  for (Instruction *inst : llvm::reverse(instructions))
     visitor.visit(*this, *inst);
   assert(m_postVisitInsts.empty());
 }
@@ -437,10 +441,19 @@ void BufferOpLowering::visitAtomicCmpXchgInst(AtomicCmpXchgInst &atomicCmpXchgIn
     if (m_pipelineState.getTargetInfo().getGfxIpVersion().major <= 11)
       coherent.bits.slc = isNonTemporal ? 1 : 0;
 
-    Value *const atomicCall = m_builder.CreateIntrinsic(
-        Intrinsic::amdgcn_raw_buffer_atomic_cmpswap, atomicCmpXchgInst.getNewValOperand()->getType(),
-        {atomicCmpXchgInst.getNewValOperand(), atomicCmpXchgInst.getCompareOperand(), bufferDesc, baseIndex,
-         m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)});
+    Value *atomicCall;
+    if (atomicCmpXchgInst.getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER) {
+      Value *const index = values[2];
+      atomicCall = m_builder.CreateIntrinsic(Intrinsic::amdgcn_struct_buffer_atomic_cmpswap, storeType,
+                                             {atomicCmpXchgInst.getNewValOperand(),
+                                              atomicCmpXchgInst.getCompareOperand(), bufferDesc, index, baseIndex,
+                                              m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)});
+    } else {
+      atomicCall = m_builder.CreateIntrinsic(Intrinsic::amdgcn_raw_buffer_atomic_cmpswap, storeType,
+                                             {atomicCmpXchgInst.getNewValOperand(),
+                                              atomicCmpXchgInst.getCompareOperand(), bufferDesc, baseIndex,
+                                              m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)});
+    }
 
     switch (atomicCmpXchgInst.getSuccessOrdering()) {
     case AtomicOrdering::Acquire:
@@ -531,45 +544,58 @@ void BufferOpLowering::visitAtomicRMWInst(AtomicRMWInst &atomicRmwInst) {
       }
       }
       Intrinsic::ID intrinsic = Intrinsic::not_intrinsic;
+      bool isStructBuffer = atomicRmwInst.getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER;
       switch (atomicRmwInst.getOperation()) {
       case AtomicRMWInst::Xchg:
-        intrinsic = Intrinsic::amdgcn_raw_buffer_atomic_swap;
+        intrinsic =
+            isStructBuffer ? Intrinsic::amdgcn_struct_buffer_atomic_swap : Intrinsic::amdgcn_raw_buffer_atomic_swap;
         break;
       case AtomicRMWInst::Add:
-        intrinsic = Intrinsic::amdgcn_raw_buffer_atomic_add;
+        intrinsic =
+            isStructBuffer ? Intrinsic::amdgcn_struct_buffer_atomic_add : Intrinsic::amdgcn_raw_buffer_atomic_add;
         break;
       case AtomicRMWInst::Sub:
-        intrinsic = Intrinsic::amdgcn_raw_buffer_atomic_sub;
+        intrinsic =
+            isStructBuffer ? Intrinsic::amdgcn_struct_buffer_atomic_sub : Intrinsic::amdgcn_raw_buffer_atomic_sub;
         break;
       case AtomicRMWInst::And:
-        intrinsic = Intrinsic::amdgcn_raw_buffer_atomic_and;
+        intrinsic =
+            isStructBuffer ? Intrinsic::amdgcn_struct_buffer_atomic_and : Intrinsic::amdgcn_raw_buffer_atomic_and;
         break;
       case AtomicRMWInst::Or:
-        intrinsic = Intrinsic::amdgcn_raw_buffer_atomic_or;
+        intrinsic = isStructBuffer ? Intrinsic::amdgcn_struct_buffer_atomic_or : Intrinsic::amdgcn_raw_buffer_atomic_or;
         break;
       case AtomicRMWInst::Xor:
-        intrinsic = Intrinsic::amdgcn_raw_buffer_atomic_xor;
+        intrinsic =
+            isStructBuffer ? Intrinsic::amdgcn_struct_buffer_atomic_xor : Intrinsic::amdgcn_raw_buffer_atomic_xor;
         break;
       case AtomicRMWInst::Max:
-        intrinsic = Intrinsic::amdgcn_raw_buffer_atomic_smax;
+        intrinsic =
+            isStructBuffer ? Intrinsic::amdgcn_struct_buffer_atomic_smax : Intrinsic::amdgcn_raw_buffer_atomic_smax;
         break;
       case AtomicRMWInst::Min:
-        intrinsic = Intrinsic::amdgcn_raw_buffer_atomic_smin;
+        intrinsic =
+            isStructBuffer ? Intrinsic::amdgcn_struct_buffer_atomic_smin : Intrinsic::amdgcn_raw_buffer_atomic_smin;
         break;
       case AtomicRMWInst::UMax:
-        intrinsic = Intrinsic::amdgcn_raw_buffer_atomic_umax;
+        intrinsic =
+            isStructBuffer ? Intrinsic::amdgcn_struct_buffer_atomic_umax : Intrinsic::amdgcn_raw_buffer_atomic_umax;
         break;
       case AtomicRMWInst::UMin:
-        intrinsic = Intrinsic::amdgcn_raw_buffer_atomic_umin;
+        intrinsic =
+            isStructBuffer ? Intrinsic::amdgcn_struct_buffer_atomic_umin : Intrinsic::amdgcn_raw_buffer_atomic_umin;
         break;
       case AtomicRMWInst::FAdd:
-        intrinsic = Intrinsic::amdgcn_raw_buffer_atomic_fadd;
+        intrinsic =
+            isStructBuffer ? Intrinsic::amdgcn_struct_buffer_atomic_fadd : Intrinsic::amdgcn_raw_buffer_atomic_fadd;
         break;
       case AtomicRMWInst::FMax:
-        intrinsic = Intrinsic::amdgcn_raw_buffer_atomic_fmax;
+        intrinsic =
+            isStructBuffer ? Intrinsic::amdgcn_struct_buffer_atomic_fmax : Intrinsic::amdgcn_raw_buffer_atomic_fmax;
         break;
       case AtomicRMWInst::FMin:
-        intrinsic = Intrinsic::amdgcn_raw_buffer_atomic_fmin;
+        intrinsic =
+            isStructBuffer ? Intrinsic::amdgcn_struct_buffer_atomic_fmin : Intrinsic::amdgcn_raw_buffer_atomic_fmin;
         break;
       default:
         llvm_unreachable("Should never be called!");
@@ -581,9 +607,17 @@ void BufferOpLowering::visitAtomicRMWInst(AtomicRMWInst &atomicRmwInst) {
         coherent.bits.slc = isNonTemporal ? 1 : 0;
       }
 
-      Value *const atomicCall = m_builder.CreateIntrinsic(intrinsic, storeType,
-                                                          {atomicRmwInst.getValOperand(), bufferDesc, baseIndex,
-                                                           m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)});
+      Value *atomicCall;
+      if (isStructBuffer) {
+        Value *const index = values[2];
+        atomicCall = m_builder.CreateIntrinsic(intrinsic, storeType,
+                                               {atomicRmwInst.getValOperand(), bufferDesc, index, baseIndex,
+                                                m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)});
+      } else {
+        atomicCall = m_builder.CreateIntrinsic(intrinsic, storeType,
+                                               {atomicRmwInst.getValOperand(), bufferDesc, baseIndex,
+                                                m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)});
+      }
       copyMetadata(atomicCall, &atomicRmwInst);
 
       switch (atomicRmwInst.getOrdering()) {
@@ -721,6 +755,37 @@ void BufferOpLowering::visitBufferDescToPtr(BufferDescToPtrOp &descToPtr) {
   LLVM_DEBUG(dbgs() << (di.divergent.value() ? "Divergent" : "Uniform") << " descriptor: " << *descriptor << '\n');
 }
 
+// =====================================================================================================================
+// Visits "convert.to.strided.buffer.pointer" instruction.
+// Patches a copy of the lowered descriptor (dwords 1..3) for the requested stride and uses it as the replacement.
+//
+// @param convertToStrided : The instruction
+void BufferOpLowering::visitConvertToStridedBufferPointer(ConvertToStridedBufferPointerOp &convertToStrided) {
+  auto values = m_typeLowering.getValue(convertToStrided.getPtr());
+  m_builder.SetInsertPoint(&convertToStrided);
+  auto *oldDescriptor = values[0];
+
+  // Dword 1: clear bits 29:16 and put the stride there.
+  auto *currentDword1 = m_builder.CreateExtractElement(oldDescriptor, 1);
+  auto *stride = m_builder.getInt32(convertToStrided.getStride());
+  auto *newDword1 = m_builder.CreateAnd(currentDword1, ~0x3FFF0000);
+  newDword1 = m_builder.CreateOr(newDword1, m_builder.CreateShl(stride, 16));
+  auto *newDescriptor = m_builder.CreateInsertElement(oldDescriptor, newDword1, 1);
+  // Dword 2: divide the old value by the stride.
+  auto *currentNumRecords = m_builder.CreateExtractElement(newDescriptor, 2);
+  auto *newNumRecords = m_builder.CreateUDiv(currentNumRecords, stride);
+  newDescriptor = m_builder.CreateInsertElement(newDescriptor, newNumRecords, 2);
+  // Dword 3: clear bits 29:28, then set bit 28.
+  auto *currentDword3 = m_builder.CreateExtractElement(newDescriptor, 3);
+  currentDword3 = m_builder.CreateAnd(currentDword3, 0xCFFFFFFF);
+  currentDword3 = m_builder.CreateOr(currentDword3, 0x10000000);
+  newDescriptor = m_builder.CreateInsertElement(newDescriptor, currentDword3, 3);
+  m_typeLowering.replaceInstruction(&convertToStrided, {newDescriptor, values[1], m_builder.getInt32(0)});
+  // Copy first: operator[] on the new key may insert and rehash, invalidating a reference to the old entry.
+  auto oldDescriptorInfo = m_descriptors[oldDescriptor];
+  m_descriptors[newDescriptor] = oldDescriptorInfo;
+}
+
 // =====================================================================================================================
 // Visits "strided.buffer.desc.to.ptr" instruction.
 //
@@ -1154,6 +1219,22 @@ void BufferOpLowering::visitInvariantStart(llvm::IntrinsicInst &intrinsic) {
   m_typeLowering.eraseInstruction(&intrinsic);
 }
 
+// =====================================================================================================================
+// Visits read first lane intrinsic.
+//
+// @param intrinsic : The intrinsic
+void BufferOpLowering::visitReadFirstLane(llvm::IntrinsicInst &intrinsic) {
+  if (!isAnyBufferPointer(&intrinsic))
+    return;
+
+  // Emit the replacement at the visited call; the builder may still point at a previously visited instruction.
+  m_builder.SetInsertPoint(&intrinsic);
+  auto values = m_typeLowering.getValue(intrinsic.getArgOperand(0));
+  // Only the second lowered value goes through readfirstlane; the descriptor (first value) is forwarded unchanged.
+  Value *ptr = m_builder.CreateIntrinsic(values[1]->getType(), Intrinsic::amdgcn_readfirstlane, values[1]);
+  m_typeLowering.replaceInstruction(&intrinsic, {values[0], ptr});
+}
+
 // =====================================================================================================================
 // Post-process visits "memcpy" instruction.
 //
@@ -1659,18 +1740,25 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) {
         accessSizeAllowed = accessSize >= 4;
       }
 
-      Value *indexValue = pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER
-                              ? pointerValues[2]
-                              : nullptr;
-      if (isInvariant && !isDivergentDesc && accessSizeAllowed) {
+      bool isStridedPointer = pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER;
+      Value *indexValue = isStridedPointer ? pointerValues[2] : nullptr;
+
+#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 458033
+      // Old version of the code
+      const bool isDivergentPtr = m_uniformityInfo.isDivergent(*pointerOperand);
+#else
+      // New version of the code (also handles unknown version, which we treat as latest)
+      const bool isDivergentPtr = m_uniformityInfo.isDivergent(pointerOperand);
+#endif
+      if (isInvariant && !isDivergentDesc && accessSizeAllowed &&
+          (!indexValue || isa<ConstantInt>(indexValue) || !isDivergentPtr)) {
         // create s.buffer.load
         Value *desc = bufferDesc;
         if (isIndexedDesc)
           desc = m_builder.CreateLoad(FixedVectorType::get(m_builder.getInt32Ty(), 4), bufferDesc);
-        if (pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER) {
+        if (isStridedPointer) {
           // Especially when the index is a constant, and the stride is known at compile-time,
           // we should create s_buffer_load instructions with constant offsets: index * stride + offset
-          assert(isa<ConstantInt>(indexValue));
           Value *desc1 = m_builder.CreateExtractElement(desc, 1);
           // stride is 61:48 bits in descriptor, which will always be constantInt when create BufferDesc
           Value *stride =
@@ -1682,12 +1770,16 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) {
         CallInst *call = m_builder.CreateIntrinsic(Intrinsic::amdgcn_s_buffer_load, intAccessType,
                                                    {desc, offsetVal, m_builder.getInt32(coherent.u32All)});
         call->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(m_builder.getContext(), {}));
-        copyMetadata(call, &inst);
         part = call;
       } else {
         if (indexValue) {
+          Intrinsic::ID intrinsic = Intrinsic::amdgcn_struct_buffer_load;
+#if !defined(LLVM_MAIN_REVISION) || LLVM_MAIN_REVISION >= 506212
+          if (ordering != AtomicOrdering::NotAtomic)
+            intrinsic = Intrinsic::amdgcn_struct_atomic_buffer_load;
+#endif
           part = m_builder.CreateIntrinsic(
-              Intrinsic::amdgcn_struct_buffer_load, intAccessType,
+              intrinsic, intAccessType,
               {getBufferDesc(), indexValue, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)});
         } else {
           unsigned intrinsicID = Intrinsic::amdgcn_raw_buffer_load;
diff --git a/lgc/patch/LowerCooperativeMatrix.cpp b/lgc/patch/LowerCooperativeMatrix.cpp
index bec5cec5d7..c012d076ef 100644
--- a/lgc/patch/LowerCooperativeMatrix.cpp
+++ b/lgc/patch/LowerCooperativeMatrix.cpp
@@ -60,6 +60,7 @@ static const Intrinsic::AMDGCNIntrinsics GetWmmaIntrinsic(GfxIpVersion gfxIp, Co
       return isTiled ? Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied : Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
     if (typeC == CooperativeMatrixElementType::Float32)
       return Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
+    break;
   }
   case CooperativeMatrixElementType::BFloat16: {
     assert(typeA == typeB);
@@ -67,10 +68,18 @@ static const Intrinsic::AMDGCNIntrinsics GetWmmaIntrinsic(GfxIpVersion gfxIp, Co
       return isTiled ? Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied : Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
     if (typeC == CooperativeMatrixElementType::Float32)
       return Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
+    break;
   }
   case CooperativeMatrixElementType::Int8: {
     if (typeC == CooperativeMatrixElementType::Int32)
       return Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
+    break;
+  }
+  case CooperativeMatrixElementType::Int4: {
+    assert(typeA == typeB);
+    if (typeC == CooperativeMatrixElementType::Int32)
+      return Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
+    break;
   }
   default:
     break;
@@ -166,6 +175,10 @@ LowerCooperativeMatrix::TypeProperties LowerCooperativeMatrix::getTypeProperties
     props.numMatrixElements = 16;
     props.numMatrixWords = 4;
     break;
+  case CooperativeMatrixElementType::Int4:
+    props.numMatrixElements = 8;
+    props.numMatrixWords = 2;
+    break;
   default:
     llvm_unreachable("unknown element type");
   }
@@ -173,9 +186,10 @@ LowerCooperativeMatrix::TypeProperties LowerCooperativeMatrix::getTypeProperties
   auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value());
   if (layout == CooperativeMatrixLayout::FactorMatrixLayout) {
     assert(elemType != CooperativeMatrixElementType::Float32 && elemType != CooperativeMatrixElementType::Int32);
-    props.numFlatElements = 16;
+    props.numFlatElements = BuilderCommon::isTypeNCooperativeMatrix(elemType, 4) ? 8 : 16;
   } else if (layout == CooperativeMatrixLayout::AccumulatorMatrixLayout) {
-    if (BuilderCommon::isTypeNCooperativeMatrix(elemType, 16)) {
+    if (BuilderCommon::isTypeNCooperativeMatrix(elemType, 16) &&
+        (elemType != CooperativeMatrixElementType::Float16Packed)) {
       props.matrixElementStride = 2;
     }
     if (elemType == CooperativeMatrixElementType::Float16Packed) {
@@ -216,7 +230,8 @@ Value *LowerCooperativeMatrix::convFlatVecToCoopMatrixVec(BuilderCommon &builder
   }
 
   Type *wordTy = vecValue->getType()->isIntOrIntVectorTy() ? builder.getInt32Ty() : builder.getFloatTy();
-  return builder.CreateBitCast(vecValue, FixedVectorType::get(wordTy, props.numMatrixWords));
+  return builder.CreateBitCast(vecValue,
+                               props.numMatrixWords == 1 ? wordTy : FixedVectorType::get(wordTy, props.numMatrixWords));
 }
 
 // =====================================================================================================================
@@ -230,8 +245,10 @@ Value *LowerCooperativeMatrix::convCoopMatrixVecToFlatVec(BuilderCommon &builder
                                                           CooperativeMatrixElementType elemType,
                                                           CooperativeMatrixLayout layout) {
   auto props = getTypeProperties(elemType, layout);
-
-  Type *flatType = FixedVectorType::get(builder.transCooperativeMatrixElementType(elemType), props.numMatrixElements);
+  Type *elemTy = builder.transCooperativeMatrixElementType(elemType);
+  if (elemTy->getScalarSizeInBits() < 8)
+    elemTy = builder.getInt8Ty();
+  Type *flatType = FixedVectorType::get(elemTy, props.numMatrixElements);
   Value *tmp = builder.CreateBitCast(matrixValue, flatType);
 
   if (props.numFlatElements < props.numMatrixElements) {
@@ -350,6 +367,8 @@ void LowerCooperativeMatrix::visitCooperativeMatrixLoadOp(CooperativeMatrixLoadO
 
   // Calc element offset in memory
   Type *elemTy = builder.transCooperativeMatrixElementType(elemType);
+  if (elemType == CooperativeMatrixElementType::Int4)
+    elemTy = builder.getInt8Ty();
   const unsigned dataBitwidth = elemTy->getScalarSizeInBits();
   const unsigned addrSpace = dataPtr->getType()->getPointerAddressSpace();
   assert(addrSpace == ADDR_SPACE_LOCAL || addrSpace == ADDR_SPACE_BUFFER_FAT_POINTER || addrSpace == ADDR_SPACE_GLOBAL);
@@ -416,7 +435,10 @@ void LowerCooperativeMatrix::visitCooperativeMatrixStoreOp(CooperativeMatrixStor
 
   // Calc element offset in memory
   Type *elemTy = builder.transCooperativeMatrixElementType(elemType);
-  const unsigned dataBitwidth = elemTy->getScalarSizeInBits();
+  if (elemType == CooperativeMatrixElementType::Int4)
+    elemTy = builder.getInt8Ty();
+
+  unsigned dataBitwidth = elemTy->getScalarSizeInBits();
   const unsigned addrSpace = dataPtr->getType()->getPointerAddressSpace();
   assert(addrSpace == ADDR_SPACE_LOCAL || addrSpace == ADDR_SPACE_BUFFER_FAT_POINTER || addrSpace == ADDR_SPACE_GLOBAL);
 
@@ -573,6 +595,63 @@ Value *LowerCooperativeMatrix::cooperativeMatrixConvertInternal(CastInst::CastOp
     source = builder.CreateBitCast(source, bfloat16Vec);
   }
 
+  // Narrows an integer vector to dstType's scalar width: 32 returns source as is, 4 packs two values into each i8.
+  auto createTruncFunc = [](BuilderBase &builder, Value *source, Type *dstType) -> Value * {
+    const unsigned numDstBits = dstType->getScalarSizeInBits();
+    if (numDstBits == 32)
+      return source;
+    if (numDstBits == 8 || numDstBits == 16)
+      return builder.CreateTrunc(source, dstType);
+    assert(numDstBits == 4);
+    // Truncate an integer into int4 via packing two consecutive i4 values in a byte
+    SmallVector<Value *> elems;
+    const unsigned srcVecSize = cast<FixedVectorType>(source->getType())->getNumElements();
+    auto vecI8 = builder.CreateTrunc(source, FixedVectorType::get(builder.getInt8Ty(), srcVecSize));
+    for (unsigned i = 0; i < srcVecSize; ++i) {
+      auto elem = builder.CreateExtractElement(vecI8, i);
+      if (i & 1)
+        elem = builder.CreateShl(elem, 4);
+      else
+        elem = builder.CreateAnd(elem, builder.getInt8(0xF));
+      elems.push_back(elem);
+    }
+    // Merge two 4-bit integers into one 8-bit integer
+    const unsigned dstVecSize = srcVecSize / 2;
+    Value *resultValue = PoisonValue::get(FixedVectorType::get(builder.getInt8Ty(), dstVecSize));
+    for (unsigned i = 0; i < dstVecSize; ++i) {
+      Value *elem = builder.CreateOr(elems[2 * i], elems[2 * i + 1]);
+      resultValue = builder.CreateInsertElement(resultValue, elem, i);
+    }
+    return resultValue;
+  };
+
+  // Unpacks int4 data (two values per i8 element of source) and casts each value to dstType's element type.
+  auto createExtFunc = [](BuilderBase &builder, Value *source, Type *dstType, CastInst::CastOps castOp) -> Value * {
+    SmallVector<Value *> elems;
+    const bool isSigned = castOp == Instruction::SExt || castOp == Instruction::SIToFP;
+    const unsigned srcVecSize = cast<FixedVectorType>(source->getType())->getNumElements();
+    for (unsigned i = 0; i < srcVecSize; ++i) {
+      Value *elem = builder.CreateExtractElement(source, i);
+      // Signed casts need both nibbles sign-extended: (elem << 4) >>s 4 copies bit 3 into the upper bits.
+      Value *elemLow = isSigned ? builder.CreateAShr(builder.CreateShl(elem, 4), 4)
+                                : builder.CreateAnd(elem, builder.getInt8(0xF));
+      Value *elemHigh = isSigned ? builder.CreateAShr(elem, 4) : builder.CreateLShr(elem, 4);
+      elems.append({elemLow, elemHigh});
+    }
+    // Perform the extending operation
+    const bool isExtInst = castOp == Instruction::SExt || castOp == Instruction::ZExt;
+    Type *dstElemTy = cast<FixedVectorType>(dstType)->getElementType();
+    Value *resultValue = PoisonValue::get(FixedVectorType::get(dstElemTy, elems.size()));
+    for (auto [index, elem] : enumerate(elems)) {
+      if (isExtInst)
+        elem = isSigned ? builder.CreateSExt(elem, dstElemTy) : builder.CreateZExt(elem, dstElemTy);
+      else
+        elem = builder.CreateCast(castOp, elem, dstElemTy);
+      resultValue = builder.CreateInsertElement(resultValue, elem, index);
+    }
+    return resultValue;
+  };
+
   if ((srcElemType == CooperativeMatrixElementType::Float16 || srcElemType == CooperativeMatrixElementType::BFloat16 ||
        srcElemType == CooperativeMatrixElementType::Float32) &&
       (castOp == Instruction::FPToUI || castOp == Instruction::FPToSI)) {
@@ -581,15 +660,19 @@ Value *LowerCooperativeMatrix::cooperativeMatrixConvertInternal(CastInst::CastOp
     // Fix the error in: dEQP-VK.compute.cooperative_matrix.nv.convert.input_float16/32_t_output_uint8_t*
     resultValue =
         builder.CreateCast(castOp, source, FixedVectorType::get(builder.getInt32Ty(), vecSize), "ConvertIntoInt32");
-    if (builder.transCooperativeMatrixElementType(dstElemType)->getScalarSizeInBits() < 32) {
-      resultValue = builder.CreateTrunc(resultValue, dstType);
-    }
+    resultValue = createTruncFunc(builder, resultValue, dstType);
   } else if (castOp == Instruction::FPTrunc && (srcElemType == CooperativeMatrixElementType::Float16 ||
                                                 srcElemType == CooperativeMatrixElementType::BFloat16)) {
     // Float16 -> BFloat16 or BFloat16 -> Float16
     resultValue = builder.CreateCast(Instruction::FPExt, source, FixedVectorType::get(builder.getFloatTy(), vecSize),
                                      "Convert16tofloat32");
     resultValue = builder.CreateFPTrunc(resultValue, dstType);
+  } else if (castOp == Instruction::Trunc &&
+             (srcElemType == CooperativeMatrixElementType::Int8 || srcElemType == CooperativeMatrixElementType::Int16 ||
+              srcElemType == CooperativeMatrixElementType::Int32)) {
+    resultValue = createTruncFunc(builder, source, dstType);
+  } else if (srcElemType == CooperativeMatrixElementType::Int4) {
+    resultValue = createExtFunc(builder, source, dstType, castOp);
   } else
     resultValue = builder.CreateCast(castOp, source, dstType, "castOpConvert");
 
@@ -623,8 +706,10 @@ void LowerCooperativeMatrix::visitCooperativeMatrixConvertOp(CooperativeMatrixCo
       const unsigned vecNums = cast<FixedVectorType>(source->getType())->getNumElements();
       source = builder.CreateBitCast(source, FixedVectorType::get(builder.getInt32Ty(), vecNums));
     }
-    resultValue = cooperativeMatrixReshape16BitElementGfx1011(source, srcElemType, srcLayout, dstLayout, threadId,
-                                                              convert.getName(), &convert);
+    {
+      resultValue = cooperativeMatrixReshape16BitElementGfx1011(source, srcElemType, srcLayout, dstLayout, threadId,
+                                                                convert.getName(), &convert);
+    }
   } else {
     unsigned numSrcBit = builder.transCooperativeMatrixElementType(srcElemType)->getScalarSizeInBits();
     unsigned numDstBit = builder.transCooperativeMatrixElementType(dstElemType)->getScalarSizeInBits();
@@ -1447,29 +1532,36 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul
   if (m_gfxIp.major >= 11) {
     Value *matrixD;
     unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage.value());
+    unsigned factorFlatElemNum = 0;
+    unsigned matrixLength = 0;
 
     if (BuilderCommon::isTypeNCooperativeMatrix(matrixAType, 16)) {
       assert(matrixAType == matrixBType);
-      unsigned factorFlatElemNum = 0;
       { factorFlatElemNum = 16; }
       Type *factorType =
           FixedVectorType::get(builder.transCooperativeMatrixElementType(matrixAType), factorFlatElemNum);
       matrixA = builder.CreateBitCast(matrixA, factorType);
       matrixB = builder.CreateBitCast(matrixB, factorType);
     } else if (BuilderCommon::isTypeNCooperativeMatrix(matrixAType, 8)) {
-    } else {
+    } else if (!BuilderCommon::isTypeNCooperativeMatrix(matrixAType, 4)) {
       llvm_unreachable("Factor element type is not supported!");
     }
 
     if (BuilderCommon::isTypeNCooperativeMatrix(matrixCType, 32)) {
-      matrixC =
-          waveSize == 64 ? builder.CreateShuffleVector(matrixC, ArrayRef<int>({0, 1, 2, 3}), "shuffleVector") : matrixC;
-    } else if (BuilderCommon::isTypeNCooperativeMatrix(matrixCType, 16)) {
-      {
+      if (m_gfxIp.major <= 12)
         matrixC = waveSize == 64 ? builder.CreateShuffleVector(matrixC, ArrayRef<int>({0, 1, 2, 3}), "shuffleVector")
                                  : matrixC;
+    } else if (BuilderCommon::isTypeNCooperativeMatrix(matrixCType, 16)) {
+      {
+        if (m_gfxIp.major == 12) {
+          // For gfxIp.major > 12 the wave size is always 32, so matrixC already has its final size; no shuffle needed.
+          matrixC =
+              waveSize == 64 ? builder.CreateShuffleVector(matrixC, ArrayRef<int>({0, 1}), "shuffleVector") : matrixC;
+        } else { // m_gfxIp.major <= 11
+          matrixC = waveSize == 64 ? builder.CreateShuffleVector(matrixC, ArrayRef<int>({0, 1, 2, 3}), "shuffleVector")
+                                   : matrixC;
+        }
       }
-      unsigned matrixLength = cast<FixedVectorType>(matrixC->getType())->getNumElements();
 
       Type *castType = nullptr;
       if (matrixCType == CooperativeMatrixElementType::BFloat16) {
@@ -1477,13 +1569,16 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul
         castType = builder.getInt16Ty();
       } else
         castType = builder.getHalfTy();
+      matrixLength = cast<FixedVectorType>(matrixC->getType())->getNumElements();
       Type *accumType = FixedVectorType::get(castType, matrixLength * 2);
       matrixC = builder.CreateBitCast(matrixC, accumType);
     } else {
       llvm_unreachable("Accumulator element type is not supported!");
     }
 
-    auto intrinsic = GetWmmaIntrinsic(m_gfxIp, matrixAType, matrixBType, matrixCType, muladd.getIsTied());
+    Intrinsic::AMDGCNIntrinsics intrinsic = InvalidInstricID;
+    intrinsic = GetWmmaIntrinsic(m_gfxIp, matrixAType, matrixBType, matrixCType, muladd.getIsTied());
+
     if (intrinsic == InvalidInstricID)
       llvm_unreachable("HW intrinsics not supported!");
 
@@ -1512,6 +1607,14 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul
       args.push_back(matrixC);
       args.push_back(builder.getInt1(isSatOrOpsel));
       break;
+    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
+      args.push_back(builder.getInt1(isSignedA));
+      args.push_back(matrixA);
+      args.push_back(builder.getInt1(isSignedB));
+      args.push_back(matrixB);
+      args.push_back(matrixC);
+      args.push_back(builder.getInt1(isSatOrOpsel));
+      break;
     default:
       llvm_unreachable("Should never be called!");
       break;
diff --git a/lgc/patch/LowerDebugPrintf.cpp b/lgc/patch/LowerDebugPrintf.cpp
index 8f9e0d7e97..cc8a512dee 100644
--- a/lgc/patch/LowerDebugPrintf.cpp
+++ b/lgc/patch/LowerDebugPrintf.cpp
@@ -75,7 +75,7 @@ PreservedAnalyses LowerDebugPrintf::run(Module &module, ModuleAnalysisManager &a
     return PreservedAnalyses::all();
 
   const ResourceNode *node = nullptr;
-  // LLpc node type is DescriptorBuffer
+  // The Llpcfe debugPrintf node type is DescriptorMutable, while the LLPC node type is DescriptorBuffer.
   // So use ResourceNodeType::Unknown to match different node type.
   std::tie(m_topNode, node) =
       pipelineState->findResourceNode(ResourceNodeType::Unknown, InternalDescriptorSetId, PrintfBufferBindingId);
diff --git a/lgc/patch/LowerGpuRt.cpp b/lgc/patch/LowerGpuRt.cpp
index fcc2eff4c3..6a7dfc24ac 100644
--- a/lgc/patch/LowerGpuRt.cpp
+++ b/lgc/patch/LowerGpuRt.cpp
@@ -77,7 +77,6 @@ PreservedAnalyses LowerGpuRt::run(Module &module, ModuleAnalysisManager &analysi
                             .add(&LowerGpuRt::visitGetFlattenedGroupThreadId)
                             .add(&LowerGpuRt::visitFloatWithRoundMode)
                             .add(&LowerGpuRt::visitGpurtDispatchThreadIdFlatOp)
-                            .add(&LowerGpuRt::visitContinuationStackIsGlobalOp)
                             .add(&LowerGpuRt::visitWaveScanOp)
                             .add(&LowerGpuRt::visitGetKnownSetRayFlagsOp)
                             .add(&LowerGpuRt::visitGetKnownUnsetRayFlagsOp)
@@ -238,13 +237,12 @@ void LowerGpuRt::visitGetStackStride(GpurtGetStackStrideOp &inst) {
 void LowerGpuRt::visitStackRead(GpurtStackReadOp &inst) {
   m_builder->SetInsertPoint(&inst);
   Value *stackIndex = inst.getIndex();
-  Type *stackTy = PointerType::get(m_builder->getInt32Ty(), 3);
   if (inst.getUseExtraStack()) {
     auto ldsStackSize = m_builder->getInt32(m_workGroupSize * MaxLdsStackEntries);
     stackIndex = m_builder->CreateAdd(stackIndex, ldsStackSize);
   }
 
-  Value *stackAddr = m_builder->CreateGEP(stackTy, m_stack, {stackIndex});
+  Value *stackAddr = m_builder->CreateGEP(m_builder->getInt32Ty(), m_stack, {stackIndex});
   Value *stackData = m_builder->CreateLoad(m_builder->getInt32Ty(), stackAddr);
 
   inst.replaceAllUsesWith(stackData);
@@ -260,13 +258,12 @@ void LowerGpuRt::visitStackWrite(GpurtStackWriteOp &inst) {
   m_builder->SetInsertPoint(&inst);
   Value *stackIndex = inst.getIndex();
   Value *stackData = inst.getValue();
-  Type *stackTy = PointerType::get(m_builder->getInt32Ty(), 3);
   if (inst.getUseExtraStack()) {
     auto ldsStackSize = m_builder->getInt32(m_workGroupSize * MaxLdsStackEntries);
     stackIndex = m_builder->CreateAdd(stackIndex, ldsStackSize);
   }
 
-  auto stackArrayAddr = m_builder->CreateGEP(stackTy, m_stack, {stackIndex});
+  auto stackArrayAddr = m_builder->CreateGEP(m_builder->getInt32Ty(), m_stack, {stackIndex});
   m_builder->CreateStore(stackData, stackArrayAddr);
 
   inst.replaceAllUsesWith(m_builder->getInt32(0));
@@ -530,18 +527,6 @@ void LowerGpuRt::visitGpurtDispatchThreadIdFlatOp(GpurtDispatchThreadIdFlatOp &i
   m_funcsToLower.insert(inst.getCalledFunction());
 }
 
-// =====================================================================================================================
-// Visit "GpurtContinuationStackIsGlobalOp" instruction
-//
-// @param inst : The dialect instruction to process
-void LowerGpuRt::visitContinuationStackIsGlobalOp(GpurtContinuationStackIsGlobalOp &inst) {
-  m_builder->SetInsertPoint(&inst);
-  bool isGlobal = m_pipelineState->getOptions().cpsFlags & CpsFlagStackInGlobalMem;
-  inst.replaceAllUsesWith(m_builder->getInt1(isGlobal));
-  m_callsToLower.push_back(&inst);
-  m_funcsToLower.insert(inst.getCalledFunction());
-}
-
 // =====================================================================================================================
 // Visit "GpurtGetKnownSetRayFlagsOp" instruction
 //
diff --git a/lgc/patch/LowerInOut.cpp b/lgc/patch/LowerInOut.cpp
index fdff15e302..eef8885f5e 100644
--- a/lgc/patch/LowerInOut.cpp
+++ b/lgc/patch/LowerInOut.cpp
@@ -25,7 +25,7 @@
 /**
  ***********************************************************************************************************************
  * @file  LowerInOut.cpp
- * @brief LLPC source file: contains implementation of class lgc::PatchInOutImportExport.
+ * @brief LLPC source file: contains implementation of class lgc::LowerInOut.
  *
  ***********************************************************************************************************************
  */
@@ -44,7 +44,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include <cmath>
 
-#define DEBUG_TYPE "lgc-patch-in-out-import-export"
+#define DEBUG_TYPE "lgc-lower-in-out"
 
 using namespace llvm;
 using namespace lgc;
@@ -55,14 +55,14 @@ namespace lgc {
 constexpr unsigned MaxHsThreadsPerSubgroup = 256;
 
 // =====================================================================================================================
-PatchInOutImportExport::PatchInOutImportExport() {
+LowerInOut::LowerInOut() {
   memset(&m_gfxIp, 0, sizeof(m_gfxIp));
   initPerShader();
 }
 
 // =====================================================================================================================
 // Initialize per-shader members
-void PatchInOutImportExport::initPerShader() {
+void LowerInOut::initPerShader() {
   m_clipDistance = nullptr;
   m_cullDistance = nullptr;
   m_primitiveId = nullptr;
@@ -84,7 +84,7 @@ void PatchInOutImportExport::initPerShader() {
 // @param [in/out] module : LLVM module to be run on
 // @param [in/out] analysisManager : Analysis manager to use for this transformation
 // @returns : The preserved analyses (The analyses that are still valid after this pass)
-PreservedAnalyses PatchInOutImportExport::run(Module &module, ModuleAnalysisManager &analysisManager) {
+PreservedAnalyses LowerInOut::run(Module &module, ModuleAnalysisManager &analysisManager) {
   PipelineState *pipelineState = analysisManager.getResult<PipelineStateWrapper>(module).getPipelineState();
   PipelineShadersResult &pipelineShaders = analysisManager.getResult<PipelineShaders>(module);
   auto getPostDominatorTree = [&](Function &f) -> PostDominatorTree & {
@@ -92,7 +92,7 @@ PreservedAnalyses PatchInOutImportExport::run(Module &module, ModuleAnalysisMana
     return fam.getResult<PostDominatorTreeAnalysis>(f);
   };
 
-  LLVM_DEBUG(dbgs() << "Run the pass Patch-In-Out-Import-Export\n");
+  LLVM_DEBUG(dbgs() << "Run the pass Lower-In-Out\n");
 
   Patch::init(&module);
 
@@ -175,10 +175,9 @@ PreservedAnalyses PatchInOutImportExport::run(Module &module, ModuleAnalysisMana
   return PreservedAnalyses::none();
 }
 
-void PatchInOutImportExport::processFunction(
-    Function &func, ShaderStageEnum shaderStage, SmallVectorImpl<Function *> &inputCallees,
-    SmallVectorImpl<Function *> &otherCallees,
-    const std::function<PostDominatorTree &(Function &)> &getPostDominatorTree) {
+void LowerInOut::processFunction(Function &func, ShaderStageEnum shaderStage, SmallVectorImpl<Function *> &inputCallees,
+                                 SmallVectorImpl<Function *> &otherCallees,
+                                 const std::function<PostDominatorTree &(Function &)> &getPostDominatorTree) {
   PostDominatorTree &postDomTree = getPostDominatorTree(func);
 
   initPerShader();
@@ -201,7 +200,7 @@ void PatchInOutImportExport::processFunction(
 //
 // @param [in/out] func : LLVM function to be run on
 // @param postDomTree : The PostDominatorTree of the \p func
-void PatchInOutImportExport::markExportDone(Function *func, PostDominatorTree &postDomTree) {
+void LowerInOut::markExportDone(Function *func, PostDominatorTree &postDomTree) {
   SmallVector<CallInst *, 4> expInsts;
 
   Function *expDecl = m_module->getFunction("llvm.amdgcn.exp.f32");
@@ -243,7 +242,7 @@ void PatchInOutImportExport::markExportDone(Function *func, PostDominatorTree &p
 
 // =====================================================================================================================
 // Process a single shader
-void PatchInOutImportExport::processShader() {
+void LowerInOut::processShader() {
   // Initialize the output value for gl_PrimitiveID
   const auto &builtInUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage.value())->builtInUsage;
   const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(m_shaderStage.value())->entryArgIdxs;
@@ -482,7 +481,7 @@ void PatchInOutImportExport::processShader() {
 // Visits all "call" instructions against the callee functions in current entry-point function.
 //
 // @param calleeFuncs : a list of candidate callee functions to check
-void PatchInOutImportExport::visitCallInsts(ArrayRef<Function *> calleeFuncs) {
+void LowerInOut::visitCallInsts(ArrayRef<Function *> calleeFuncs) {
   for (auto callee : calleeFuncs) {
     for (auto user : callee->users()) {
       if (CallInst *callInst = dyn_cast<CallInst>(user)) {
@@ -495,7 +494,7 @@ void PatchInOutImportExport::visitCallInsts(ArrayRef<Function *> calleeFuncs) {
 
 // =====================================================================================================================
 // Visits all "ret" instructions in current entry-point function.
-void PatchInOutImportExport::visitReturnInsts() {
+void LowerInOut::visitReturnInsts() {
   for (auto &block : *m_entryPoint)
     if (auto *retInst = dyn_cast<ReturnInst>(block.getTerminator()))
       visitReturnInst(*retInst);
@@ -505,7 +504,7 @@ void PatchInOutImportExport::visitReturnInsts() {
 // Visits "call" instruction.
 //
 // @param callInst : "Call" instruction
-void PatchInOutImportExport::visitCallInst(CallInst &callInst) {
+void LowerInOut::visitCallInst(CallInst &callInst) {
   auto callee = callInst.getCalledFunction();
   if (!callee)
     return;
@@ -1112,7 +1111,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) {
 // Visits "ret" instruction.
 //
 // @param retInst : "Ret" instruction
-void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) {
+void LowerInOut::visitReturnInst(ReturnInst &retInst) {
   // We only handle the "ret" of shader entry point
   if (!m_shaderStage)
     return;
@@ -1586,8 +1585,8 @@ void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) {
 // @param compIdx : Index used for vector element indexing
 // @param vertexIdx : Input array outermost index used for vertex indexing
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::patchTcsGenericInputImport(Type *inputTy, unsigned location, Value *locOffset,
-                                                          Value *compIdx, Value *vertexIdx, BuilderBase &builder) {
+Value *LowerInOut::patchTcsGenericInputImport(Type *inputTy, unsigned location, Value *locOffset, Value *compIdx,
+                                              Value *vertexIdx, BuilderBase &builder) {
   assert(compIdx && vertexIdx);
 
   auto ldsOffset = calcLdsOffsetForTcsInput(inputTy, location, locOffset, compIdx, vertexIdx, builder);
@@ -1603,8 +1602,8 @@ Value *PatchInOutImportExport::patchTcsGenericInputImport(Type *inputTy, unsigne
 // @param compIdx : Index used for vector element indexing
 // @param vertexIdx : Input array outermost index used for vertex indexing (could be null)
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::patchTesGenericInputImport(Type *inputTy, unsigned location, Value *locOffset,
-                                                          Value *compIdx, Value *vertexIdx, BuilderBase &builder) {
+Value *LowerInOut::patchTesGenericInputImport(Type *inputTy, unsigned location, Value *locOffset, Value *compIdx,
+                                              Value *vertexIdx, BuilderBase &builder) {
   assert(compIdx);
 
   auto ldsOffset = calcLdsOffsetForTesInput(inputTy, location, locOffset, compIdx, vertexIdx, builder);
@@ -1619,8 +1618,8 @@ Value *PatchInOutImportExport::patchTesGenericInputImport(Type *inputTy, unsigne
 // @param compIdx : Index used for vector element indexing
 // @param vertexIdx : Input array outermost index used for vertex indexing
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::patchGsGenericInputImport(Type *inputTy, unsigned location, unsigned compIdx,
-                                                         Value *vertexIdx, BuilderBase &builder) {
+Value *LowerInOut::patchGsGenericInputImport(Type *inputTy, unsigned location, unsigned compIdx, Value *vertexIdx,
+                                             BuilderBase &builder) {
   assert(vertexIdx);
 
   const unsigned compCount = inputTy->isVectorTy() ? cast<FixedVectorType>(inputTy)->getNumElements() : 1;
@@ -1658,8 +1657,8 @@ Value *PatchInOutImportExport::patchGsGenericInputImport(Type *inputTy, unsigned
 // @param coordI: Value of I coordinate
 // @param coordJ: Value of J coordinate
 // @param primMask: Value to fill into m0 register
-Value *PatchInOutImportExport::performFsFloatInterpolation(BuilderBase &builder, Value *attr, Value *channel,
-                                                           Value *coordI, Value *coordJ, Value *primMask) {
+Value *LowerInOut::performFsFloatInterpolation(BuilderBase &builder, Value *attr, Value *channel, Value *coordI,
+                                               Value *coordJ, Value *primMask) {
   Value *result = nullptr;
   if (m_gfxIp.major >= 11) {
     // llvm.amdgcn.lds.param.load(attr_channel, attr, m0)
@@ -1693,9 +1692,8 @@ Value *PatchInOutImportExport::performFsFloatInterpolation(BuilderBase &builder,
 // @param coordJ: Value of J coordinate
 // @param primMask: Value to fill into m0 register
 // @param highHalf : Whether it is a high half in a 16-bit attribute
-Value *PatchInOutImportExport::performFsHalfInterpolation(BuilderBase &builder, Value *attr, Value *channel,
-                                                          Value *coordI, Value *coordJ, Value *primMask,
-                                                          Value *highHalf) {
+Value *LowerInOut::performFsHalfInterpolation(BuilderBase &builder, Value *attr, Value *channel, Value *coordI,
+                                              Value *coordJ, Value *primMask, Value *highHalf) {
   Value *result = nullptr;
   if (m_gfxIp.major >= 11) {
     // llvm.amdgcn.lds.param.load(attr_channel, attr, m0)
@@ -1739,9 +1737,8 @@ Value *PatchInOutImportExport::performFsHalfInterpolation(BuilderBase &builder,
 // @param primMask : Value to fill into m0 register
 // @param bitWidth : The bitwidth of required data type
 // @param highHalf : Whether it is a high half in a 16-bit attribute
-Value *PatchInOutImportExport::performFsParameterLoad(BuilderBase &builder, Value *attr, Value *channel,
-                                                      InterpParam interpParam, Value *primMask, unsigned bitWidth,
-                                                      bool highHalf) {
+Value *LowerInOut::performFsParameterLoad(BuilderBase &builder, Value *attr, Value *channel, InterpParam interpParam,
+                                          Value *primMask, unsigned bitWidth, bool highHalf) {
   Value *compValue = nullptr;
 
   if (m_gfxIp.major >= 11) {
@@ -1805,9 +1802,9 @@ Value *PatchInOutImportExport::performFsParameterLoad(BuilderBase &builder, Valu
 // vertex index; unused for "flat" mode or if the input is per-primitive
 // @param highHalf : Whether it is a high half in a 16-bit attribute
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::patchFsGenericInputImport(Type *inputTy, unsigned location, Value *locOffset,
-                                                         Value *compIdx, bool isPerPrimitive, unsigned interpMode,
-                                                         Value *interpValue, bool highHalf, BuilderBase &builder) {
+Value *LowerInOut::patchFsGenericInputImport(Type *inputTy, unsigned location, Value *locOffset, Value *compIdx,
+                                             bool isPerPrimitive, unsigned interpMode, Value *interpValue,
+                                             bool highHalf, BuilderBase &builder) {
   auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Fragment);
   auto &interpInfo = resUsage->inOutUsage.fs.interpInfo;
 
@@ -1970,8 +1967,8 @@ Value *PatchInOutImportExport::patchFsGenericInputImport(Type *inputTy, unsigned
 // @param compIdx : Index used for vector element indexing
 // @param vertexIdx : Input array outermost index used for vertex indexing (could be null)
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::patchTcsGenericOutputImport(Type *outputTy, unsigned location, Value *locOffset,
-                                                           Value *compIdx, Value *vertexIdx, BuilderBase &builder) {
+Value *LowerInOut::patchTcsGenericOutputImport(Type *outputTy, unsigned location, Value *locOffset, Value *compIdx,
+                                               Value *vertexIdx, BuilderBase &builder) {
   assert(compIdx);
   auto ldsOffset = calcLdsOffsetForTcsOutput(outputTy, location, locOffset, compIdx, vertexIdx, builder);
   return readValueFromLds(true, outputTy, ldsOffset, builder);
@@ -1984,8 +1981,7 @@ Value *PatchInOutImportExport::patchTcsGenericOutputImport(Type *outputTy, unsig
 // @param location : Location of the output
 // @param compIdx : Index used for vector element indexing
 // @param builder : The IR builder to create and insert IR instruction
-void PatchInOutImportExport::patchVsGenericOutputExport(Value *output, unsigned location, unsigned compIdx,
-                                                        BuilderBase &builder) {
+void LowerInOut::patchVsGenericOutputExport(Value *output, unsigned location, unsigned compIdx, BuilderBase &builder) {
   auto outputTy = output->getType();
 
   if (m_hasTs) {
@@ -2022,8 +2018,8 @@ void PatchInOutImportExport::patchVsGenericOutputExport(Value *output, unsigned
 // @param compIdx : Index used for vector element indexing
 // @param vertexIdx : Input array outermost index used for vertex indexing (could be null)
 // @param builder : The IR builder to create and insert IR instruction
-void PatchInOutImportExport::patchTcsGenericOutputExport(Value *output, unsigned location, Value *locOffset,
-                                                         Value *compIdx, Value *vertexIdx, BuilderBase &builder) {
+void LowerInOut::patchTcsGenericOutputExport(Value *output, unsigned location, Value *locOffset, Value *compIdx,
+                                             Value *vertexIdx, BuilderBase &builder) {
   assert(compIdx);
   Type *outputTy = output->getType();
   auto ldsOffset = calcLdsOffsetForTcsOutput(outputTy, location, locOffset, compIdx, vertexIdx, builder);
@@ -2037,8 +2033,7 @@ void PatchInOutImportExport::patchTcsGenericOutputExport(Value *output, unsigned
 // @param location : Location of the output
 // @param compIdx : Index used for vector element indexing
 // @param builder : The IR builder to create and insert IR instruction
-void PatchInOutImportExport::patchTesGenericOutputExport(Value *output, unsigned location, unsigned compIdx,
-                                                         BuilderBase &builder) {
+void LowerInOut::patchTesGenericOutputExport(Value *output, unsigned location, unsigned compIdx, BuilderBase &builder) {
   if (m_hasGs) {
     auto outputTy = output->getType();
     assert(outputTy->isIntOrIntVectorTy() || outputTy->isFPOrFPVectorTy());
@@ -2068,8 +2063,8 @@ void PatchInOutImportExport::patchTesGenericOutputExport(Value *output, unsigned
 // @param compIdx : Index used for vector element indexing
 // @param streamId : ID of output vertex stream
 // @param builder : The IR builder to create and insert IR instruction
-void PatchInOutImportExport::patchGsGenericOutputExport(Value *output, unsigned location, unsigned compIdx,
-                                                        unsigned streamId, BuilderBase &builder) {
+void LowerInOut::patchGsGenericOutputExport(Value *output, unsigned location, unsigned compIdx, unsigned streamId,
+                                            BuilderBase &builder) {
   auto outputTy = output->getType();
 
   // Cast double or double vector to float vector.
@@ -2105,9 +2100,8 @@ void PatchInOutImportExport::patchGsGenericOutputExport(Value *output, unsigned
 // @param vertexOrPrimitiveIdx : Input array outermost index used for vertex or primitive indexing
 // @param isPerPrimitive : Whether the output is per-primitive
 // @param builder : The IR builder to create and insert IR instruction
-void PatchInOutImportExport::patchMeshGenericOutputExport(Value *output, unsigned location, Value *locOffset,
-                                                          Value *compIdx, Value *vertexOrPrimitiveIdx,
-                                                          bool isPerPrimitive, BuilderBase &builder) {
+void LowerInOut::patchMeshGenericOutputExport(Value *output, unsigned location, Value *locOffset, Value *compIdx,
+                                              Value *vertexOrPrimitiveIdx, bool isPerPrimitive, BuilderBase &builder) {
   if (output->getType()->getScalarSizeInBits() == 64)
     compIdx = builder.CreateShl(compIdx, 1);
 
@@ -2122,8 +2116,8 @@ void PatchInOutImportExport::patchMeshGenericOutputExport(Value *output, unsigne
 // @param elemIdx : Index used for array/vector element indexing (could be null)
 // @param vertexIdx : Input array outermost index used for vertex indexing (could be null)
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::patchTcsBuiltInInputImport(Type *inputTy, unsigned builtInId, Value *elemIdx,
-                                                          Value *vertexIdx, BuilderBase &builder) {
+Value *LowerInOut::patchTcsBuiltInInputImport(Type *inputTy, unsigned builtInId, Value *elemIdx, Value *vertexIdx,
+                                              BuilderBase &builder) {
   Value *input = PoisonValue::get(inputTy);
 
   auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::TessControl)->entryArgIdxs.tcs;
@@ -2212,8 +2206,8 @@ Value *PatchInOutImportExport::patchTcsBuiltInInputImport(Type *inputTy, unsigne
 // @param elemIdx : Index used for array/vector element indexing (could be null)
 // @param vertexIdx : Input array outermost index used for vertex indexing (could be null)
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::patchTesBuiltInInputImport(Type *inputTy, unsigned builtInId, Value *elemIdx,
-                                                          Value *vertexIdx, BuilderBase &builder) {
+Value *LowerInOut::patchTesBuiltInInputImport(Type *inputTy, unsigned builtInId, Value *elemIdx, Value *vertexIdx,
+                                              BuilderBase &builder) {
   Value *input = PoisonValue::get(inputTy);
 
   auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::TessEval)->entryArgIdxs.tes;
@@ -2339,8 +2333,8 @@ Value *PatchInOutImportExport::patchTesBuiltInInputImport(Type *inputTy, unsigne
 // @param builtInId : ID of the built-in variable
 // @param vertexIdx : Input array outermost index used for vertex indexing (could be null)
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::patchGsBuiltInInputImport(Type *inputTy, unsigned builtInId, Value *vertexIdx,
-                                                         BuilderBase &builder) {
+Value *LowerInOut::patchGsBuiltInInputImport(Type *inputTy, unsigned builtInId, Value *vertexIdx,
+                                             BuilderBase &builder) {
   Value *input = nullptr;
 
   auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Geometry)->entryArgIdxs.gs;
@@ -2394,7 +2388,7 @@ Value *PatchInOutImportExport::patchGsBuiltInInputImport(Type *inputTy, unsigned
 // @param inputTy : Type of input value
 // @param builtInId : ID of the built-in variable
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::patchMeshBuiltInInputImport(Type *inputTy, unsigned builtInId, BuilderBase &builder) {
+Value *LowerInOut::patchMeshBuiltInInputImport(Type *inputTy, unsigned builtInId, BuilderBase &builder) {
   // Handle work group size built-in
   if (builtInId == BuiltInWorkgroupSize) {
     // WorkgroupSize is a constant vector supplied by mesh shader mode.
@@ -2450,8 +2444,8 @@ Value *PatchInOutImportExport::patchMeshBuiltInInputImport(Type *inputTy, unsign
 // @param builtInId : ID of the built-in variable
 // @param generalVal : Sample ID, only needed for BuiltInSamplePosOffset; InterpLoc, only needed for BuiltInBaryCoord
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::patchFsBuiltInInputImport(Type *inputTy, unsigned builtInId, Value *generalVal,
-                                                         BuilderBase &builder) {
+Value *LowerInOut::patchFsBuiltInInputImport(Type *inputTy, unsigned builtInId, Value *generalVal,
+                                             BuilderBase &builder) {
   Value *input = PoisonValue::get(inputTy);
 
   const auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Fragment)->entryArgIdxs.fs;
@@ -2799,7 +2793,7 @@ Value *PatchInOutImportExport::patchFsBuiltInInputImport(Type *inputTy, unsigned
 // @param inputTy : Type of BuiltInSamplePosOffset
 // @param sampleId : Sample ID
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::getSamplePosOffset(Type *inputTy, Value *sampleId, BuilderBase &builder) {
+Value *LowerInOut::getSamplePosOffset(Type *inputTy, Value *sampleId, BuilderBase &builder) {
   // Gets the offset of sample position relative to the pixel center for the specified sample ID
   Value *numSamples = patchFsBuiltInInputImport(builder.getInt32Ty(), BuiltInNumSamples, nullptr, builder);
   Value *patternIdx = patchFsBuiltInInputImport(builder.getInt32Ty(), BuiltInSamplePatternIdx, nullptr, builder);
@@ -2820,7 +2814,7 @@ Value *PatchInOutImportExport::getSamplePosOffset(Type *inputTy, Value *sampleId
 //
 // @param inputTy : Type of BuiltInSamplePosition
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::getSamplePosition(Type *inputTy, BuilderBase &builder) {
+Value *LowerInOut::getSamplePosition(Type *inputTy, BuilderBase &builder) {
   Value *sampleId = patchFsBuiltInInputImport(builder.getInt32Ty(), BuiltInSampleId, nullptr, builder);
   Value *input = patchFsBuiltInInputImport(inputTy, BuiltInSamplePosOffset, sampleId, builder);
   return builder.CreateFAdd(input, ConstantFP::get(inputTy, 0.5));
@@ -2834,8 +2828,8 @@ Value *PatchInOutImportExport::getSamplePosition(Type *inputTy, BuilderBase &bui
 // @param elemIdx : Index used for array/vector element indexing (could be null)
 // @param vertexIdx : Output array outermost index used for vertex indexing (could be null)
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::patchTcsBuiltInOutputImport(Type *outputTy, unsigned builtInId, Value *elemIdx,
-                                                           Value *vertexIdx, BuilderBase &builder) {
+Value *LowerInOut::patchTcsBuiltInOutputImport(Type *outputTy, unsigned builtInId, Value *elemIdx, Value *vertexIdx,
+                                               BuilderBase &builder) {
   Value *output = PoisonValue::get(outputTy);
 
   const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl);
@@ -2938,7 +2932,7 @@ Value *PatchInOutImportExport::patchTcsBuiltInOutputImport(Type *outputTy, unsig
 // @param output : Output value
 // @param builtInId : ID of the built-in variable
 // @param builder : the builder to use
-void PatchInOutImportExport::patchVsBuiltInOutputExport(Value *output, unsigned builtInId, BuilderBase &builder) {
+void LowerInOut::patchVsBuiltInOutputExport(Value *output, unsigned builtInId, BuilderBase &builder) {
   auto outputTy = output->getType();
 
   const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Vertex);
@@ -3104,8 +3098,8 @@ void PatchInOutImportExport::patchVsBuiltInOutputExport(Value *output, unsigned
 // @param elemIdx : Index used for array/vector element indexing (could be null)
 // @param vertexIdx : Output array outermost index used for vertex indexing (could be null)
 // @param builder : the builder to use
-void PatchInOutImportExport::patchTcsBuiltInOutputExport(Value *output, unsigned builtInId, Value *elemIdx,
-                                                         Value *vertexIdx, BuilderBase &builder) {
+void LowerInOut::patchTcsBuiltInOutputExport(Value *output, unsigned builtInId, Value *elemIdx, Value *vertexIdx,
+                                             BuilderBase &builder) {
   auto outputTy = output->getType();
 
   const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl);
@@ -3220,7 +3214,7 @@ void PatchInOutImportExport::patchTcsBuiltInOutputExport(Value *output, unsigned
 // @param output : Output value
 // @param builtInId : ID of the built-in variable
 // @param builder : the builder to use
-void PatchInOutImportExport::patchTesBuiltInOutputExport(Value *output, unsigned builtInId, BuilderBase &builder) {
+void LowerInOut::patchTesBuiltInOutputExport(Value *output, unsigned builtInId, BuilderBase &builder) {
   const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessEval);
   auto &builtInUsage = resUsage->builtInUsage.tes;
   auto &builtInOutLocMap = resUsage->inOutUsage.builtInOutputLocMap;
@@ -3335,8 +3329,8 @@ void PatchInOutImportExport::patchTesBuiltInOutputExport(Value *output, unsigned
 // @param builtInId : ID of the built-in variable
 // @param streamId : ID of output vertex stream
 // @param builder : the builder to use
-void PatchInOutImportExport::patchGsBuiltInOutputExport(Value *output, unsigned builtInId, unsigned streamId,
-                                                        BuilderBase &builder) {
+void LowerInOut::patchGsBuiltInOutputExport(Value *output, unsigned builtInId, unsigned streamId,
+                                            BuilderBase &builder) {
   if (streamId != m_pipelineState->getRasterizerState().rasterStream)
     return; // Skip built-in export if this stream is not the rasterization stream.
 
@@ -3390,9 +3384,8 @@ void PatchInOutImportExport::patchGsBuiltInOutputExport(Value *output, unsigned
 // @param vertexOrPrimitiveIdx : Output array outermost index used for vertex or primitive indexing
 // @param isPerPrimitive : Whether the output is per-primitive
 // @param builder : the builder to use
-void PatchInOutImportExport::patchMeshBuiltInOutputExport(Value *output, unsigned builtInId, Value *elemIdx,
-                                                          Value *vertexOrPrimitiveIdx, bool isPerPrimitive,
-                                                          BuilderBase &builder) {
+void LowerInOut::patchMeshBuiltInOutputExport(Value *output, unsigned builtInId, Value *elemIdx,
+                                              Value *vertexOrPrimitiveIdx, bool isPerPrimitive, BuilderBase &builder) {
   // Handle primitive indices built-ins
   if (builtInId == BuiltInPrimitivePointIndices || builtInId == BuiltInPrimitiveLineIndices ||
       builtInId == BuiltInPrimitiveTriangleIndices) {
@@ -3484,7 +3477,7 @@ void PatchInOutImportExport::patchMeshBuiltInOutputExport(Value *output, unsigne
 // @param output : Output value
 // @param builtInId : ID of the built-in variable
 // @param builder : the builder to use
-void PatchInOutImportExport::patchFsBuiltInOutputExport(Value *output, unsigned builtInId, BuilderBase &builder) {
+void LowerInOut::patchFsBuiltInOutputExport(Value *output, unsigned builtInId, BuilderBase &builder) {
   switch (builtInId) {
   case BuiltInFragDepth: {
     m_fragDepth = output;
@@ -3515,8 +3508,7 @@ void PatchInOutImportExport::patchFsBuiltInOutputExport(Value *output, unsigned
 // @param output : Output value
 // @param builtInId : ID of the built-in variable
 // @param builder : the builder to use
-void PatchInOutImportExport::patchCopyShaderBuiltInOutputExport(Value *output, unsigned builtInId,
-                                                                BuilderBase &builder) {
+void LowerInOut::patchCopyShaderBuiltInOutputExport(Value *output, unsigned builtInId, BuilderBase &builder) {
   switch (builtInId) {
   case BuiltInPosition:
   case BuiltInPointSize: {
@@ -3575,8 +3567,8 @@ void PatchInOutImportExport::patchCopyShaderBuiltInOutputExport(Value *output, u
 // @param xfbOffset : Transform feedback offset
 // @param streamId : Output stream ID
 // @param builder : The IR builder to create and insert IR instruction
-void PatchInOutImportExport::patchXfbOutputExport(Value *output, unsigned xfbBuffer, unsigned xfbOffset,
-                                                  unsigned streamId, BuilderBase &builder) {
+void LowerInOut::patchXfbOutputExport(Value *output, unsigned xfbBuffer, unsigned xfbOffset, unsigned streamId,
+                                      BuilderBase &builder) {
   assert(m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval ||
          m_shaderStage == ShaderStage::CopyShader);
 
@@ -3640,9 +3632,9 @@ void PatchInOutImportExport::patchXfbOutputExport(Value *output, unsigned xfbBuf
 // @param bufBase : Buffer base offset
 // @param coherent : Buffer coherency
 // @param builder : The IR builder to create and insert IR instruction
-unsigned PatchInOutImportExport::combineBufferStore(const std::vector<Value *> &storeValues, unsigned startIdx,
-                                                    unsigned valueOffset, Value *bufDesc, Value *storeOffset,
-                                                    Value *bufBase, CoherentFlag coherent, BuilderBase &builder) {
+unsigned LowerInOut::combineBufferStore(const std::vector<Value *> &storeValues, unsigned startIdx,
+                                        unsigned valueOffset, Value *bufDesc, Value *storeOffset, Value *bufBase,
+                                        CoherentFlag coherent, BuilderBase &builder) {
   Type *storeTys[4] = {
       builder.getInt32Ty(),
       FixedVectorType::get(builder.getInt32Ty(), 2),
@@ -3696,9 +3688,8 @@ unsigned PatchInOutImportExport::combineBufferStore(const std::vector<Value *> &
 // @param bufBase : Buffer base offset
 // @param coherent : Buffer coherency
 // @param builder : The IR builder to create and insert IR instruction
-unsigned PatchInOutImportExport::combineBufferLoad(std::vector<Value *> &loadValues, unsigned startIdx, Value *bufDesc,
-                                                   Value *loadOffset, Value *bufBase, CoherentFlag coherent,
-                                                   BuilderBase &builder) {
+unsigned LowerInOut::combineBufferLoad(std::vector<Value *> &loadValues, unsigned startIdx, Value *bufDesc,
+                                       Value *loadOffset, Value *bufBase, CoherentFlag coherent, BuilderBase &builder) {
   Type *loadTyps[4] = {
       builder.getInt32Ty(),
       FixedVectorType::get(builder.getInt32Ty(), 2),
@@ -3748,8 +3739,8 @@ unsigned PatchInOutImportExport::combineBufferLoad(std::vector<Value *> &loadVal
 // @param xfbStride : Transform feedback stride
 // @param streamId : Output stream ID
 // @param builder : The IR builder to create and insert IR instruction
-void PatchInOutImportExport::storeValueToStreamOutBuffer(Value *storeValue, unsigned xfbBuffer, unsigned xfbOffset,
-                                                         unsigned xfbStride, unsigned streamId, BuilderBase &builder) {
+void LowerInOut::storeValueToStreamOutBuffer(Value *storeValue, unsigned xfbBuffer, unsigned xfbOffset,
+                                             unsigned xfbStride, unsigned streamId, BuilderBase &builder) {
   assert(m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval ||
          m_shaderStage == ShaderStage::CopyShader);
   assert(xfbBuffer < MaxTransformFeedbackBuffers);
@@ -3869,8 +3860,7 @@ void PatchInOutImportExport::storeValueToStreamOutBuffer(Value *storeValue, unsi
 // @param location : Output location
 // @param compIdx : Output component index
 // @param builder : the builder to use
-void PatchInOutImportExport::storeValueToEsGsRing(Value *storeValue, unsigned location, unsigned compIdx,
-                                                  BuilderBase &builder) {
+void LowerInOut::storeValueToEsGsRing(Value *storeValue, unsigned location, unsigned compIdx, BuilderBase &builder) {
   auto storeTy = storeValue->getType();
 
   Type *elemTy = storeTy;
@@ -3937,8 +3927,8 @@ void PatchInOutImportExport::storeValueToEsGsRing(Value *storeValue, unsigned lo
 // @param compIdx : Input component index
 // @param vertexIdx : Vertex index
 // @param builder : the builder to use
-Value *PatchInOutImportExport::loadValueFromEsGsRing(Type *loadTy, unsigned location, unsigned compIdx,
-                                                     Value *vertexIdx, BuilderBase &builder) {
+Value *LowerInOut::loadValueFromEsGsRing(Type *loadTy, unsigned location, unsigned compIdx, Value *vertexIdx,
+                                         BuilderBase &builder) {
   Type *elemTy = loadTy;
   if (loadTy->isArrayTy())
     elemTy = cast<ArrayType>(loadTy)->getElementType();
@@ -3984,8 +3974,8 @@ Value *PatchInOutImportExport::loadValueFromEsGsRing(Type *loadTy, unsigned loca
 // @param compIdx : Output component index
 // @param streamId : Output stream ID
 // @param builder : the builder to use
-void PatchInOutImportExport::storeValueToGsVsRing(Value *storeValue, unsigned location, unsigned compIdx,
-                                                  unsigned streamId, BuilderBase &builder) {
+void LowerInOut::storeValueToGsVsRing(Value *storeValue, unsigned location, unsigned compIdx, unsigned streamId,
+                                      BuilderBase &builder) {
   auto storeTy = storeValue->getType();
 
   Type *elemTy = storeTy;
@@ -4105,8 +4095,8 @@ void PatchInOutImportExport::storeValueToGsVsRing(Value *storeValue, unsigned lo
 // @param compIdx : Output component index
 // @param esGsOffset : ES-GS ring offset in bytes
 // @param builder : the builder to use
-Value *PatchInOutImportExport::calcEsGsRingOffsetForOutput(unsigned location, unsigned compIdx, Value *esGsOffset,
-                                                           BuilderBase &builder) {
+Value *LowerInOut::calcEsGsRingOffsetForOutput(unsigned location, unsigned compIdx, Value *esGsOffset,
+                                               BuilderBase &builder) {
   // ES -> GS ring is always on-chip on GFX10+
   // ringOffset = esGsOffset + threadId * esGsRingItemSize + location * 4 + compIdx
   assert(m_pipelineState->hasShaderStage(ShaderStage::Geometry));
@@ -4126,8 +4116,8 @@ Value *PatchInOutImportExport::calcEsGsRingOffsetForOutput(unsigned location, un
 // @param compIdx : Input Component index
 // @param vertexIdx : Vertex index
 // @param builder : the builder to use
-Value *PatchInOutImportExport::calcEsGsRingOffsetForInput(unsigned location, unsigned compIdx, Value *vertexIdx,
-                                                          BuilderBase &builder) {
+Value *LowerInOut::calcEsGsRingOffsetForInput(unsigned location, unsigned compIdx, Value *vertexIdx,
+                                              BuilderBase &builder) {
   // ES -> GS ring is always on-chip on GFX10+
   assert(m_pipelineState->hasShaderStage(ShaderStage::Geometry));
   const auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor;
@@ -4172,8 +4162,8 @@ Value *PatchInOutImportExport::calcEsGsRingOffsetForInput(unsigned location, uns
 // @param vertexIdx : Vertex index
 // @param gsVsOffset : ES-GS ring offset in bytes
 // @param builder : the builder to use
-Value *PatchInOutImportExport::calcGsVsRingOffsetForOutput(unsigned location, unsigned compIdx, unsigned streamId,
-                                                           Value *vertexIdx, Value *gsVsOffset, BuilderBase &builder) {
+Value *LowerInOut::calcGsVsRingOffsetForOutput(unsigned location, unsigned compIdx, unsigned streamId, Value *vertexIdx,
+                                               Value *gsVsOffset, BuilderBase &builder) {
   auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry);
 
   Value *ringOffset = nullptr;
@@ -4227,7 +4217,7 @@ Value *PatchInOutImportExport::calcGsVsRingOffsetForOutput(unsigned location, un
 // @param readTy : Type of value read from LDS
 // @param ldsOffset : Start offset to do LDS read operations
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::readValueFromLds(bool offChip, Type *readTy, Value *ldsOffset, BuilderBase &builder) {
+Value *LowerInOut::readValueFromLds(bool offChip, Type *readTy, Value *ldsOffset, BuilderBase &builder) {
   assert(readTy->isSingleValueType());
 
   // Read dwords from LDS
@@ -4309,7 +4299,7 @@ Value *PatchInOutImportExport::readValueFromLds(bool offChip, Type *readTy, Valu
 // @param writeValue : Value written to LDS
 // @param ldsOffset : Start offset to do LDS write operations
 // @param builder : The IR builder to create and insert IR instruction
-void PatchInOutImportExport::writeValueToLds(bool offChip, Value *writeValue, Value *ldsOffset, BuilderBase &builder) {
+void LowerInOut::writeValueToLds(bool offChip, Value *writeValue, Value *ldsOffset, BuilderBase &builder) {
   auto writeTy = writeValue->getType();
   assert(writeTy->isSingleValueType());
 
@@ -4375,8 +4365,7 @@ void PatchInOutImportExport::writeValueToLds(bool offChip, Value *writeValue, Va
 // @param location : Base location of the output
 // @param compIdx : Index used for vector element indexing
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::calcLdsOffsetForVsOutput(Type *outputTy, unsigned location, unsigned compIdx,
-                                                        BuilderBase &builder) {
+Value *LowerInOut::calcLdsOffsetForVsOutput(Type *outputTy, unsigned location, unsigned compIdx, BuilderBase &builder) {
   assert(m_shaderStage == ShaderStage::Vertex);
 
   // attribOffset = location * 4 + compIdx
@@ -4414,8 +4403,8 @@ Value *PatchInOutImportExport::calcLdsOffsetForVsOutput(Type *outputTy, unsigned
 // @param compIdx : Index used for vector element indexing (could be null)
 // @param vertexIdx : Vertex indexing
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::calcLdsOffsetForTcsInput(Type *inputTy, unsigned location, Value *locOffset,
-                                                        Value *compIdx, Value *vertexIdx, BuilderBase &builder) {
+Value *LowerInOut::calcLdsOffsetForTcsInput(Type *inputTy, unsigned location, Value *locOffset, Value *compIdx,
+                                            Value *vertexIdx, BuilderBase &builder) {
   assert(m_shaderStage == ShaderStage::TessControl);
 
   const auto &inOutUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs;
@@ -4467,8 +4456,8 @@ Value *PatchInOutImportExport::calcLdsOffsetForTcsInput(Type *inputTy, unsigned
 // @param compIdx : Index used for vector element indexing (could be null)
 // @param vertexIdx : Vertex indexing
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::calcLdsOffsetForTcsOutput(Type *outputTy, unsigned location, Value *locOffset,
-                                                         Value *compIdx, Value *vertexIdx, BuilderBase &builder) {
+Value *LowerInOut::calcLdsOffsetForTcsOutput(Type *outputTy, unsigned location, Value *locOffset, Value *compIdx,
+                                             Value *vertexIdx, BuilderBase &builder) {
   assert(m_shaderStage == ShaderStage::TessControl);
 
   const auto &inOutUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs;
@@ -4537,8 +4526,8 @@ Value *PatchInOutImportExport::calcLdsOffsetForTcsOutput(Type *outputTy, unsigne
 // @param compIdx : Index used for vector element indexing (could be null)
 // @param vertexIdx : Vertex indexing
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::calcLdsOffsetForTesInput(Type *inputTy, unsigned location, Value *locOffset,
-                                                        Value *compIdx, Value *vertexIdx, BuilderBase &builder) {
+Value *LowerInOut::calcLdsOffsetForTesInput(Type *inputTy, unsigned location, Value *locOffset, Value *compIdx,
+                                            Value *vertexIdx, BuilderBase &builder) {
   assert(m_shaderStage == ShaderStage::TessEval);
 
   const auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::TessControl)->inOutUsage.tcs.calcFactor;
@@ -4609,10 +4598,9 @@ Value *PatchInOutImportExport::calcLdsOffsetForTesInput(Type *inputTy, unsigned
 // @param outVertexStride : Vertex stride of output patch in (dwords)
 // @param patchConstCount : Count of output patch constants
 // @param tessFactorStride : Stride of tessellation factors (dwords)
-unsigned PatchInOutImportExport::calcPatchCountPerThreadGroup(unsigned inVertexCount, unsigned inVertexStride,
-                                                              unsigned outVertexCount, unsigned outVertexStride,
-                                                              unsigned patchConstCount,
-                                                              unsigned tessFactorStride) const {
+unsigned LowerInOut::calcPatchCountPerThreadGroup(unsigned inVertexCount, unsigned inVertexStride,
+                                                  unsigned outVertexCount, unsigned outVertexStride,
+                                                  unsigned patchConstCount, unsigned tessFactorStride) const {
   unsigned maxThreadCountPerThreadGroup = MaxHsThreadsPerSubgroup;
 
   // NOTE: If ray query uses LDS stack, the expected max thread count in the group is 64. And we force wave size
@@ -4695,8 +4683,8 @@ unsigned PatchInOutImportExport::calcPatchCountPerThreadGroup(unsigned inVertexC
 // @param location : Location of the output
 // @param compIdx : Index used for vector element indexing
 // @param builder : the builder to use
-void PatchInOutImportExport::addExportInstForGenericOutput(Value *output, unsigned location, unsigned compIdx,
-                                                           BuilderBase &builder) {
+void LowerInOut::addExportInstForGenericOutput(Value *output, unsigned location, unsigned compIdx,
+                                               BuilderBase &builder) {
   // Check if the shader stage is valid to use "exp" instruction to export output
   const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value());
   const bool useExpInst = ((m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval ||
@@ -4791,7 +4779,7 @@ void PatchInOutImportExport::addExportInstForGenericOutput(Value *output, unsign
 // @param output : Output value
 // @param builtInId : ID of the built-in variable
 // @param builder : the builder to use
-void PatchInOutImportExport::addExportInstForBuiltInOutput(Value *output, unsigned builtInId, BuilderBase &builder) {
+void LowerInOut::addExportInstForBuiltInOutput(Value *output, unsigned builtInId, BuilderBase &builder) {
   const auto poison = PoisonValue::get(builder.getFloatTy());
 
   switch (builtInId) {
@@ -4866,7 +4854,7 @@ void PatchInOutImportExport::addExportInstForBuiltInOutput(Value *output, unsign
 // @param centroidIj : Centroid I/J provided by hardware natively
 // @param centerIj : Center I/J provided by hardware natively
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::adjustCentroidIj(Value *centroidIj, Value *centerIj, BuilderBase &builder) {
+Value *LowerInOut::adjustCentroidIj(Value *centroidIj, Value *centerIj, BuilderBase &builder) {
   auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Fragment)->entryArgIdxs.fs;
   auto primMask = getFunctionArgument(m_entryPoint, entryArgIdxs.primMask);
   auto &builtInUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Fragment)->builtInUsage.fs;
@@ -4888,7 +4876,7 @@ Value *PatchInOutImportExport::adjustCentroidIj(Value *centroidIj, Value *center
 // Get Subgroup local invocation Id
 //
 // @param builder : The IR builder to create and insert IR instruction
-Value *PatchInOutImportExport::getSubgroupLocalInvocationId(BuilderBase &builder) {
+Value *LowerInOut::getSubgroupLocalInvocationId(BuilderBase &builder) {
   Value *subgroupLocalInvocationId =
       builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {builder.getInt32(-1), builder.getInt32(0)});
 
@@ -4904,7 +4892,7 @@ Value *PatchInOutImportExport::getSubgroupLocalInvocationId(BuilderBase &builder
 // =====================================================================================================================
 // Creates the LGC intrinsic "lgc.swizzle.thread.group" to swizzle thread group for optimization purposes.
 //
-void PatchInOutImportExport::createSwizzleThreadGroupFunction() {
+void LowerInOut::createSwizzleThreadGroupFunction() {
 
   // Generate IR instructions to swizzle thread groups with repeating N x N tiles of morton patterns. If the X or Y
   // dimensions are not divisible by N, thread groups along the right and bottom sections of the dispatch get row-major
@@ -5229,7 +5217,7 @@ void PatchInOutImportExport::createSwizzleThreadGroupFunction() {
 //
 // @param shadingRate : LGC shading rate
 // @param builder : the builder to use
-void PatchInOutImportExport::exportShadingRate(Value *shadingRate, BuilderBase &builder) {
+void LowerInOut::exportShadingRate(Value *shadingRate, BuilderBase &builder) {
   assert(m_gfxIp >= GfxIpVersion({10, 3})); // Must be GFX10.3+
 
   Value *hwShadingRate = nullptr;
@@ -5307,7 +5295,7 @@ void PatchInOutImportExport::exportShadingRate(Value *shadingRate, BuilderBase &
 
 // =====================================================================================================================
 // Gets HW primitive type from ancillary bits.
-Value *PatchInOutImportExport::getPrimType(BuilderBase &builder) {
+Value *LowerInOut::getPrimType(BuilderBase &builder) {
   assert(m_shaderStage == ShaderStage::Fragment);
   auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Fragment)->entryArgIdxs.fs;
   auto ancillary = getFunctionArgument(m_entryPoint, entryArgIdxs.ancillary);
@@ -5320,7 +5308,7 @@ Value *PatchInOutImportExport::getPrimType(BuilderBase &builder) {
 // Gets HW line stipple value from lineStipple value.
 //
 // @param builder : the builder to use
-Value *PatchInOutImportExport::getLineStipple(BuilderBase &builder) {
+Value *LowerInOut::getLineStipple(BuilderBase &builder) {
   assert(m_shaderStage == ShaderStage::Fragment);
   auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Fragment)->entryArgIdxs.fs;
   auto line_stipple = getFunctionArgument(m_entryPoint, entryArgIdxs.lineStipple);
@@ -5332,7 +5320,7 @@ Value *PatchInOutImportExport::getLineStipple(BuilderBase &builder) {
 // Gets HW shading rate and converts them to LGC definitions.
 //
 // @param builder : the builder to use
-Value *PatchInOutImportExport::getShadingRate(BuilderBase &builder) {
+Value *LowerInOut::getShadingRate(BuilderBase &builder) {
   assert(m_gfxIp >= GfxIpVersion({10, 3})); // Must be GFX10.3+
 
   assert(m_shaderStage == ShaderStage::Fragment);
@@ -5388,7 +5376,7 @@ Value *PatchInOutImportExport::getShadingRate(BuilderBase &builder) {
 //
 // @param location : Vertex attribute location
 // @param attribValues : Values of this vertex attribute to export
-void PatchInOutImportExport::recordVertexAttribExport(unsigned location, ArrayRef<Value *> attribValues) {
+void LowerInOut::recordVertexAttribExport(unsigned location, ArrayRef<Value *> attribValues) {
   assert(m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval ||
          m_shaderStage == ShaderStage::CopyShader); // Valid shader stages
   assert(location <= MaxInOutLocCount);             // 32 attributes at most
@@ -5427,7 +5415,7 @@ void PatchInOutImportExport::recordVertexAttribExport(unsigned location, ArrayRe
 // Exports vertex attributes that were recorded previously
 //
 // @param builder : the builder to use
-void PatchInOutImportExport::exportVertexAttribs(BuilderBase &builder) {
+void LowerInOut::exportVertexAttribs(BuilderBase &builder) {
   assert(m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval ||
          m_shaderStage == ShaderStage::CopyShader); // Valid shader stages
   if (m_attribExports.empty()) {
@@ -5458,12 +5446,9 @@ void PatchInOutImportExport::exportVertexAttribs(BuilderBase &builder) {
       for (unsigned i = 0; i < 4; ++i)
         attribValue = builder.CreateInsertElement(attribValue, attribExport.second[i], i);
       // NOTE: Create a call if we export vertex attribute through memory. This call will be expanded when NGG primitive
-      // shader is generated. The arguments are: buffer descriptor of attribute ring, attribute location, and attribute
-      // export value.
+      // shader is generated. The arguments are the attribute location and the attribute export value.
       builder.CreateNamedCall(lgcName::NggAttributeThroughMemory, builder.getVoidTy(),
-                              {m_pipelineSysValues.get(m_entryPoint)->getAttribRingBufDesc(),
-                               builder.getInt32(attribExport.first), attribValue},
-                              {});
+                              {builder.getInt32(attribExport.first), attribValue}, {});
     }
   }
 }
diff --git a/lgc/patch/LowerMulDx9Zero.cpp b/lgc/patch/LowerMulDx9Zero.cpp
index fa9495121d..af58aaef49 100644
--- a/lgc/patch/LowerMulDx9Zero.cpp
+++ b/lgc/patch/LowerMulDx9Zero.cpp
@@ -25,7 +25,7 @@
 /**
  ***********************************************************************************************************************
  * @file  LowerMulDx9Zero.cpp
- * @brief LLPC source file: contains implementation of class lgc::PatchMulDx9Zero.
+ * @brief LLPC source file: contains implementation of class lgc::LowerMulDx9Zero.
  ***********************************************************************************************************************
  */
 #include "lgc/patch/LowerMulDx9Zero.h"
@@ -38,7 +38,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
-#define DEBUG_TYPE "lgc-patch-mul-dx9-zero"
+#define DEBUG_TYPE "lgc-lower-mul-dx9-zero"
 
 using namespace lgc;
 using namespace llvm;
@@ -46,7 +46,7 @@ using namespace PatternMatch;
 
 namespace lgc {
 // =====================================================================================================================
-PatchMulDx9Zero::PatchMulDx9Zero() : m_changed(false) {
+LowerMulDx9Zero::LowerMulDx9Zero() : m_changed(false) {
 }
 
 // =====================================================================================================================
@@ -58,8 +58,8 @@ PatchMulDx9Zero::PatchMulDx9Zero() : m_changed(false) {
 // fma((b==0.0 ? 0.0 : a), (a==0.0 ? 0.0 : b), c)
 // @param [in/out] analysisManager : Analysis manager to use for this transformation
 // @returns : The preserved analyses (The analyses that are still valid after this pass)
-PreservedAnalyses PatchMulDx9Zero::run(Function &function, FunctionAnalysisManager &analysisManager) {
-  LLVM_DEBUG(dbgs() << "Run the pass Patch-Mul-Dx9Zero-Opt\n");
+PreservedAnalyses LowerMulDx9Zero::run(Function &function, FunctionAnalysisManager &analysisManager) {
+  LLVM_DEBUG(dbgs() << "Run the pass Lower-Mul-Dx9Zero-Opt\n");
 
   m_builder = std::make_unique<IRBuilder<>>(function.getContext());
 
@@ -72,7 +72,7 @@ PreservedAnalyses PatchMulDx9Zero::run(Function &function, FunctionAnalysisManag
 // Visits call instruction.
 //
 // @param callInst : Call instruction
-void PatchMulDx9Zero::visitCallInst(CallInst &callInst) {
+void LowerMulDx9Zero::visitCallInst(CallInst &callInst) {
   auto callee = callInst.getCalledFunction();
   if (!callee)
     return;
@@ -103,7 +103,7 @@ void PatchMulDx9Zero::visitCallInst(CallInst &callInst) {
 // Visits binary operator instruction.
 //
 // @param binaryOp : Binary operator instruction
-void PatchMulDx9Zero::visitBinaryOperator(BinaryOperator &binaryOp) {
+void LowerMulDx9Zero::visitBinaryOperator(BinaryOperator &binaryOp) {
   Instruction::BinaryOps opCode = binaryOp.getOpcode();
 
   // Replace mul with amdgcn_fmul_legacy intrinsic when detect patterns like:
@@ -132,7 +132,7 @@ void PatchMulDx9Zero::visitBinaryOperator(BinaryOperator &binaryOp) {
 // with DX9 zero semantics. If so, returns a pair of operands for the new multiply.
 // @param lhs : left operand for the operation
 // @param rhs:  right operand for the operation
-std::optional<std::pair<Value *, Value *>> PatchMulDx9Zero::isMulDx9Zero(Value *lhs, Value *rhs) {
+std::optional<std::pair<Value *, Value *>> LowerMulDx9Zero::isMulDx9Zero(Value *lhs, Value *rhs) {
   Value *lhsCmpValue = nullptr;
   Value *lhsFalseValue = nullptr;
   Value *rhsCmpValue = nullptr;
diff --git a/lgc/patch/PatchReadFirstLane.cpp b/lgc/patch/LowerReadFirstLane.cpp
similarity index 96%
rename from lgc/patch/PatchReadFirstLane.cpp
rename to lgc/patch/LowerReadFirstLane.cpp
index 5b40e1fdbc..b598806edc 100644
--- a/lgc/patch/PatchReadFirstLane.cpp
+++ b/lgc/patch/LowerReadFirstLane.cpp
@@ -24,11 +24,11 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  PatchReadFirstLane.cpp
+ * @file  LowerReadFirstLane.cpp
  * @brief LLPC source file: contains declaration and implementation of class lgc::PatchReadFirstLane.
  ***********************************************************************************************************************
  */
-#include "lgc/patch/PatchReadFirstLane.h"
+#include "lgc/patch/LowerReadFirstLane.h"
 #include "lgc/patch/Patch.h"
 #include "lgc/state/PipelineState.h"
 #include "lgc/util/BuilderBase.h"
@@ -79,10 +79,6 @@ class ReadFirstLaneOptimizer {
   bool isAllUsersAssumedUniform(Instruction *inst);
   void applyReadFirstLane(Instruction *inst, BuilderBase &builder);
 
-  // We only support to apply amdgcn_readfirstlane on float or int type
-  // TODO: Support various types when backend work is ready
-  bool isSupportedType(Instruction *inst) { return inst->getType()->isFloatTy() || inst->getType()->isIntegerTy(32); }
-
   UniformityInfo &m_uniformityInfo;
   TargetTransformInfo &m_targetTransformInfo;
 
@@ -486,7 +482,7 @@ void ReadFirstLaneOptimizer::findBestInsertLocation(const SmallVectorImpl<Instru
 
       enforcedUniformTracker.push_back(current);
 
-      if (isSupportedType(current)) {
+      if (isReadFirstLaneTypeSupported(current->getType())) {
         bestInsertLocation = current;
         bestInsertLocationDepth = enforcedUniformTracker.size();
       }
@@ -530,20 +526,9 @@ void ReadFirstLaneOptimizer::applyReadFirstLane(Instruction *inst, BuilderBase &
   builder.SetInsertPoint(insertPos);
 
   Type *instTy = inst->getType();
-  const bool isFloat = instTy->isFloatTy();
-  assert(isFloat || instTy->isIntegerTy(32));
-  Value *newInst = inst;
-  if (isFloat)
-    newInst = builder.CreateBitCast(inst, builder.getInt32Ty());
-
-  Value *readFirstLane = builder.CreateIntrinsic(builder.getInt32Ty(), Intrinsic::amdgcn_readfirstlane, newInst);
-
-  Value *replaceInst = nullptr;
-  if (isFloat) {
-    replaceInst = builder.CreateBitCast(readFirstLane, instTy);
-  } else {
-    newInst = readFirstLane;
-    replaceInst = readFirstLane;
-  }
-  inst->replaceUsesWithIf(replaceInst, [newInst](Use &U) { return U.getUser() != newInst; });
+  assert(isReadFirstLaneTypeSupported(instTy));
+
+  Value *readFirstLane = builder.CreateIntrinsic(instTy, Intrinsic::amdgcn_readfirstlane, inst);
+
+  inst->replaceUsesWithIf(readFirstLane, [readFirstLane](Use &U) { return U.getUser() != readFirstLane; });
 }
diff --git a/lgc/patch/MeshTaskShader.cpp b/lgc/patch/MeshTaskShader.cpp
index c289b82595..85577e675d 100644
--- a/lgc/patch/MeshTaskShader.cpp
+++ b/lgc/patch/MeshTaskShader.cpp
@@ -30,6 +30,7 @@
  */
 #include "MeshTaskShader.h"
 #include "ShaderMerger.h"
+#include "lgc/patch/MutateEntryPoint.h"
 #include "lgc/patch/Patch.h"
 #include "lgc/util/Debug.h"
 #include "lgc/util/WorkgroupLayout.h"
@@ -432,7 +433,7 @@ unsigned MeshTaskShader::layoutMeshShaderLds(PipelineState *pipelineState, Funct
 
   unsigned sharedVarLdsSizeInDwords = 0;
   for (auto meshSharedVar : meshSharedVars) {
-    assert(meshSharedVar->getAlignment() == 4); // Must be 1 dword
+    assert(meshSharedVar->getAlignment() % 4 == 0); // Must be multiple of 1 dword
     const auto sizeInBytes =
         meshSharedVar->getParent()->getDataLayout().getTypeAllocSize(meshSharedVar->getValueType());
     assert(sizeInBytes % 4 == 0); // Must be multiple of 4
@@ -508,7 +509,7 @@ unsigned MeshTaskShader::layoutMeshShaderLds(PipelineState *pipelineState, Funct
     if (!meshSharedVars.empty()) {
       LLPC_OUTS("Shared Variables:\n");
       for (auto meshSharedVar : meshSharedVars) {
-        assert(meshSharedVar->getAlignment() == 4); // Must be 1 dword
+        assert(meshSharedVar->getAlignment() % 4 == 0); // Must be multiple of 1 dword
         const auto sizeInBytes =
             meshSharedVar->getParent()->getDataLayout().getTypeAllocSize(meshSharedVar->getValueType());
         assert(sizeInBytes % 4 == 0); // Must be multiple of 4
@@ -602,6 +603,7 @@ void MeshTaskShader::processTaskShader(Function *entryPoint) {
 
   static auto visitor = llvm_dialects::VisitorBuilder<MeshTaskShader>()
                             .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration)
+                            .add<GroupMemcpyOp>(&MeshTaskShader::lowerGroupMemcpy)
                             .add<TaskPayloadPtrOp>(&MeshTaskShader::lowerTaskPayloadPtr)
                             .add<EmitMeshTasksOp>(&MeshTaskShader::lowerEmitMeshTasks)
                             .build();
@@ -1174,6 +1176,46 @@ void MeshTaskShader::processMeshShader(Function *entryPoint) {
   updateMeshShaderInOutUsage();
 }
 
+// =====================================================================================================================
+// Lower GroupMemcpyOp - copy memory using all threads in a workgroup.
+//
+// @param groupMemcpyOp : Call instruction to do group memory copy
+void MeshTaskShader::lowerGroupMemcpy(GroupMemcpyOp &groupMemcpyOp) {
+  Function *entryPoint = groupMemcpyOp.getFunction();
+  auto stage = getShaderStage(entryPoint);
+  m_builder.SetInsertPoint(&groupMemcpyOp);
+
+  unsigned scopeSize = 0;
+  Value *threadIndex = nullptr;
+
+  auto scope = groupMemcpyOp.getScope();
+  if (scope == MemcpyScopeWorkGroup) {
+    unsigned workgroupSize[3] = {};
+    auto shaderModes = m_pipelineState->getShaderModes();
+    if (stage == ShaderStage::Task) {
+      Module &module = *groupMemcpyOp.getModule();
+      workgroupSize[0] = shaderModes->getComputeShaderMode(module).workgroupSizeX;
+      workgroupSize[1] = shaderModes->getComputeShaderMode(module).workgroupSizeY;
+      workgroupSize[2] = shaderModes->getComputeShaderMode(module).workgroupSizeZ;
+    } else if (stage == ShaderStage::Mesh) {
+      workgroupSize[0] = shaderModes->getMeshShaderMode().workgroupSizeX;
+      workgroupSize[1] = shaderModes->getMeshShaderMode().workgroupSizeY;
+      workgroupSize[2] = shaderModes->getMeshShaderMode().workgroupSizeZ;
+    } else {
+      llvm_unreachable("Invalid shade stage!");
+    }
+
+    scopeSize = workgroupSize[0] * workgroupSize[1] * workgroupSize[2];
+    threadIndex = m_waveThreadInfo.threadIdInSubgroup;
+  } else {
+    llvm_unreachable("Unsupported scope!");
+  }
+
+  MutateEntryPoint::processGroupMemcpy(groupMemcpyOp, m_builder, threadIndex, scopeSize);
+
+  m_callsToRemove.push_back(&groupMemcpyOp);
+}
+
 // =====================================================================================================================
 // Lower task payload pointer to buffer fat pointer.
 //
diff --git a/lgc/patch/MeshTaskShader.h b/lgc/patch/MeshTaskShader.h
index 35075fdf07..d007b081ce 100644
--- a/lgc/patch/MeshTaskShader.h
+++ b/lgc/patch/MeshTaskShader.h
@@ -31,7 +31,7 @@
 #pragma once
 
 #include "lgc/LgcDialect.h"
-#include "lgc/patch/PatchPreparePipelineAbi.h"
+#include "lgc/patch/PreparePipelineAbi.h"
 #include "lgc/patch/SystemValues.h"
 #include "lgc/state/PipelineState.h"
 #include "lgc/state/TargetInfo.h"
@@ -98,6 +98,7 @@ class MeshTaskShader {
 
   void processTaskShader(llvm::Function *entryPoint);
   void processMeshShader(llvm::Function *entryPoint);
+  void lowerGroupMemcpy(GroupMemcpyOp &groupMemcpyOp);
   void lowerTaskPayloadPtr(TaskPayloadPtrOp &taskPayloadPtrOp);
   void lowerEmitMeshTasks(EmitMeshTasksOp &emitMeshTasksOp);
   void lowerSetMeshOutputs(SetMeshOutputsOp &setMeshOutputsOp);
diff --git a/lgc/patch/MutateEntryPoint.cpp b/lgc/patch/MutateEntryPoint.cpp
index f69f32b56a..5766b985d0 100644
--- a/lgc/patch/MutateEntryPoint.cpp
+++ b/lgc/patch/MutateEntryPoint.cpp
@@ -59,7 +59,6 @@
 #include "lgc/LgcContext.h"
 #include "lgc/LgcCpsDialect.h"
 #include "lgc/LgcDialect.h"
-#include "lgc/builder/BuilderImpl.h"
 #include "lgc/patch/ShaderInputs.h"
 #include "lgc/patch/SystemValues.h"
 #include "lgc/state/AbiMetadata.h"
@@ -87,9 +86,12 @@ using namespace lgc;
 using namespace cps;
 
 // =====================================================================================================================
-MutateEntryPoint::MutateEntryPoint()
-    : m_hasTs(false), m_hasGs(false),
-      m_setInactiveChainArgId(Function::lookupIntrinsicID("llvm.amdgcn.set.inactive.chain.arg")) {
+MutateEntryPoint::MutateEntryPoint() : m_hasTs(false), m_hasGs(false) {
+#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 513481
+  m_setInactiveChainArgId = Function::lookupIntrinsicID("llvm.amdgcn.set.inactive.chain.arg");
+#else
+  m_setInactiveChainArgId = Intrinsic::lookupIntrinsicID("llvm.amdgcn.set.inactive.chain.arg");
+#endif
 }
 
 // =====================================================================================================================
@@ -163,7 +165,9 @@ PreservedAnalyses MutateEntryPoint::run(Module &module, ModuleAnalysisManager &a
 
   m_cpsShaderInputCache.clear();
 
-  processGroupMemcpy(module);
+  if (!m_pipelineState->isGraphics())
+    processCsGroupMemcpy(module);
+
   processDriverTableLoad(module);
 
   return PreservedAnalyses::none();
@@ -304,7 +308,7 @@ void MutateEntryPoint::lowerDriverTableLoad(LoadDriverTableEntryOp &loadDriverTa
 // Process GroupMemcpyOp.
 //
 // @param module : LLVM module
-void MutateEntryPoint::processGroupMemcpy(Module &module) {
+void MutateEntryPoint::processCsGroupMemcpy(Module &module) {
   SmallVector<CallInst *> callsToRemove;
 
   struct Payload {
@@ -317,7 +321,7 @@ void MutateEntryPoint::processGroupMemcpy(Module &module) {
   static auto visitor = llvm_dialects::VisitorBuilder<Payload>()
                             .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration)
                             .add<GroupMemcpyOp>([](auto &payload, auto &op) {
-                              payload.self->lowerGroupMemcpy(op);
+                              payload.self->lowerCsGroupMemcpy(op);
                               payload.callsToRemove.push_back(&op);
                             })
                             .build();
@@ -331,127 +335,80 @@ void MutateEntryPoint::processGroupMemcpy(Module &module) {
 // Lower GroupMemcpyOp - Copy memory using threads in a workgroup (scope=2) or subgroup (scope=3).
 //
 // @param groupMemcpyOp : Call instruction to do group memory copy
-void MutateEntryPoint::lowerGroupMemcpy(GroupMemcpyOp &groupMemcpyOp) {
-  BuilderImpl builder(m_pipelineState);
+void MutateEntryPoint::lowerCsGroupMemcpy(GroupMemcpyOp &groupMemcpyOp) {
+  BuilderBase builder(groupMemcpyOp.getContext());
   Function *entryPoint = groupMemcpyOp.getFunction();
-  auto stage = getShaderStage(entryPoint);
-  builder.setShaderStage(stage);
   builder.SetInsertPoint(&groupMemcpyOp);
 
-  auto gfxIp = m_pipelineState->getTargetInfo().getGfxIpVersion();
-
-  auto dst = groupMemcpyOp.getDst();
-  auto src = groupMemcpyOp.getSrc();
-  auto len = groupMemcpyOp.getSize();
-  auto scope = groupMemcpyOp.getScope();
-
   unsigned scopeSize = 0;
   Value *threadIndex = nullptr;
 
-  if (scope == 2) {
+  auto scope = groupMemcpyOp.getScope();
+  if (scope == MemcpyScopeWorkGroup) {
     unsigned workgroupSize[3] = {};
     auto shaderModes = m_pipelineState->getShaderModes();
-    if (stage == ShaderStage::Task || stage == ShaderStage::Compute) {
-      Module &module = *groupMemcpyOp.getModule();
-      workgroupSize[0] = shaderModes->getComputeShaderMode(module).workgroupSizeX;
-      workgroupSize[1] = shaderModes->getComputeShaderMode(module).workgroupSizeY;
-      workgroupSize[2] = shaderModes->getComputeShaderMode(module).workgroupSizeZ;
-    } else if (stage == ShaderStage::Mesh) {
-      workgroupSize[0] = shaderModes->getMeshShaderMode().workgroupSizeX;
-      workgroupSize[1] = shaderModes->getMeshShaderMode().workgroupSizeY;
-      workgroupSize[2] = shaderModes->getMeshShaderMode().workgroupSizeZ;
-    } else {
-      llvm_unreachable("Invalid shade stage!");
-    }
+    assert(getShaderStage(entryPoint) == ShaderStage::Compute);
 
-    // LocalInvocationId is a function argument now and CreateReadBuiltInInput cannot retrieve it.
-    unsigned argIndex = 0xFFFFFFFF;
-    switch (stage.value()) {
-    case ShaderStage::Task: {
-      auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Task)->entryArgIdxs.task;
-      argIndex = entryArgIdxs.localInvocationId;
-      break;
-    }
-    case ShaderStage::Mesh: {
-      auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Mesh)->entryArgIdxs.mesh;
-      argIndex = entryArgIdxs.localInvocationId;
-      break;
-    }
-    case ShaderStage::Compute: {
-      auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Compute)->entryArgIdxs.cs;
-      argIndex = entryArgIdxs.localInvocationId;
-      break;
-    }
-    default:
-      llvm_unreachable("Invalid shade stage!");
-      break;
-    }
-
-    const unsigned waveSize = m_pipelineState->getShaderWaveSize(stage.value());
-
-    // For mesh shader the following two ids are required.
-    Value *waveIdInSubgroupMesh = nullptr;
-    Value *threadIdInWaveMesh = nullptr;
-    if (stage == ShaderStage::Mesh) {
-      builder.CreateIntrinsic(Intrinsic::amdgcn_init_exec, {}, builder.getInt64(-1));
-      // waveId = mergedWaveInfo[27:24]
-      Value *mergedWaveInfo =
-          getFunctionArgument(entryPoint, ShaderMerger::getSpecialSgprInputIndex(gfxIp, EsGs::MergedWaveInfo));
-      waveIdInSubgroupMesh = builder.CreateAnd(builder.CreateLShr(mergedWaveInfo, 24), 0xF, "waveIdInSubgroupMesh");
-
-      threadIdInWaveMesh =
-          builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {builder.getInt32(-1), builder.getInt32(0)});
-      if (waveSize == 64) {
-        threadIdInWaveMesh =
-            builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {builder.getInt32(-1), threadIdInWaveMesh});
-      }
-      threadIdInWaveMesh->setName("threadIdInWaveMesh");
-    }
+    Module &module = *groupMemcpyOp.getModule();
+    workgroupSize[0] = shaderModes->getComputeShaderMode(module).workgroupSizeX;
+    workgroupSize[1] = shaderModes->getComputeShaderMode(module).workgroupSizeY;
+    workgroupSize[2] = shaderModes->getComputeShaderMode(module).workgroupSizeZ;
 
-    unsigned workgroupTotalSize = workgroupSize[0] * workgroupSize[1] * workgroupSize[2];
+    scopeSize = workgroupSize[0] * workgroupSize[1] * workgroupSize[2];
 
-    scopeSize = workgroupTotalSize;
+    auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Compute)->entryArgIdxs.cs;
+    Value *threadIdInGroup = getFunctionArgument(entryPoint, entryArgIdxs.localInvocationId);
+    Value *threadIdComp[3];
 
-    // localInvocationId argument for mesh shader is available from GFX11+. But it can be retrieved in anther way.
-    if (stage == ShaderStage::Mesh) {
-      threadIndex = builder.CreateAdd(builder.CreateMul(waveIdInSubgroupMesh, builder.getInt32(waveSize)),
-                                      threadIdInWaveMesh, "threadIdInSubgroupMesh");
+    auto gfxIp = m_pipelineState->getTargetInfo().getGfxIpVersion();
+    if (gfxIp.major < 11) {
+      for (unsigned idx = 0; idx < 3; idx++)
+        threadIdComp[idx] = builder.CreateExtractElement(threadIdInGroup, idx);
     } else {
-      Value *threadIdInGroup = getFunctionArgument(entryPoint, argIndex);
-      Value *threadIdComp[3];
-      if (gfxIp.major < 11) {
-        for (unsigned idx = 0; idx < 3; idx++)
-          threadIdComp[idx] = builder.CreateExtractElement(threadIdInGroup, idx);
-      } else {
-        // The local invocation ID is packed to VGPR0 on GFX11+ with the following layout:
-        //
-        //   +-----------------------+-----------------------+-----------------------+
-        //   | Local Invocation ID Z | Local Invocation ID Y | Local Invocation ID X |
-        //   | [29:20]               | [19:10]               | [9:0]                 |
-        //   +-----------------------+-----------------------+-----------------------+
-        // localInvocationIdZ = localInvocationId[29:20]
-        threadIdComp[2] = builder.CreateAnd(builder.CreateLShr(threadIdInGroup, 20), 0x3FF, "localInvocationIdZ");
-        // localInvocationIdY = localInvocationId[19:10]
-        threadIdComp[1] = builder.CreateAnd(builder.CreateLShr(threadIdInGroup, 10), 0x3FF, "localInvocationIdY");
-        // localInvocationIdX = localInvocationId[9:0]
-        threadIdComp[0] = builder.CreateAnd(threadIdInGroup, 0x3FF, "localInvocationIdX");
-      }
-
-      // LocalInvocationIndex is
-      // (LocalInvocationId.Z * WorkgroupSize.Y + LocalInvocationId.Y) * WorkGroupSize.X + LocalInvocationId.X
-      // tidigCompCnt is not always 3 if groupSizeY and/or groupSizeZ are 1. See RegisterMetadataBuilder.cpp.
-      threadIndex = builder.getInt32(0);
-      if (workgroupSize[2] > 1)
-        threadIndex = builder.CreateMul(threadIdComp[2], builder.getInt32(workgroupSize[1]));
-      if (workgroupSize[1] > 1)
-        threadIndex =
-            builder.CreateMul(builder.CreateAdd(threadIndex, threadIdComp[1]), builder.getInt32(workgroupSize[0]));
-      threadIndex = builder.CreateAdd(threadIndex, threadIdComp[0]);
+      // The local invocation ID is packed to VGPR0 on GFX11+ with the following layout:
+      //
+      //   +-----------------------+-----------------------+-----------------------+
+      //   | Local Invocation ID Z | Local Invocation ID Y | Local Invocation ID X |
+      //   | [29:20]               | [19:10]               | [9:0]                 |
+      //   +-----------------------+-----------------------+-----------------------+
+      // localInvocationIdZ = localInvocationId[29:20]
+      threadIdComp[2] = builder.CreateAnd(builder.CreateLShr(threadIdInGroup, 20), 0x3FF, "localInvocationIdZ");
+      // localInvocationIdY = localInvocationId[19:10]
+      threadIdComp[1] = builder.CreateAnd(builder.CreateLShr(threadIdInGroup, 10), 0x3FF, "localInvocationIdY");
+      // localInvocationIdX = localInvocationId[9:0]
+      threadIdComp[0] = builder.CreateAnd(threadIdInGroup, 0x3FF, "localInvocationIdX");
     }
+
+    // LocalInvocationIndex is
+    // (LocalInvocationId.Z * WorkgroupSize.Y + LocalInvocationId.Y) * WorkGroupSize.X + LocalInvocationId.X
+    // tidigCompCnt is not always set to 2(xyz) if groupSizeY and/or groupSizeZ are 1. See RegisterMetadataBuilder.cpp.
+    threadIndex = builder.getInt32(0);
+    if (workgroupSize[2] > 1)
+      threadIndex = builder.CreateMul(threadIdComp[2], builder.getInt32(workgroupSize[1]));
+    if (workgroupSize[1] > 1)
+      threadIndex =
+          builder.CreateMul(builder.CreateAdd(threadIndex, threadIdComp[1]), builder.getInt32(workgroupSize[0]));
+    threadIndex = builder.CreateAdd(threadIndex, threadIdComp[0]);
   } else {
     llvm_unreachable("Unsupported scope!");
   }
 
+  processGroupMemcpy(groupMemcpyOp, builder, threadIndex, scopeSize);
+}
+
+// =====================================================================================================================
+// Common code to do the memory copy part of GroupMemcpyOp, used by MeshTaskShader and MutateEntryPoint.
+//
+// @param groupMemcpyOp : Call instruction to do group memory copy
+// @param builder : The IR builder for inserting instructions
+// @param threadIndex : Current thread index
+// @param scopeSize : Number of threads in the specified scope (currently workgroup only, maybe subgroup).
+void MutateEntryPoint::processGroupMemcpy(GroupMemcpyOp &groupMemcpyOp, BuilderBase &builder, Value *threadIndex,
+                                          unsigned scopeSize) {
+  auto dst = groupMemcpyOp.getDst();
+  auto src = groupMemcpyOp.getSrc();
+  auto len = groupMemcpyOp.getSize();
+
   // Copy in 16-bytes if possible
   unsigned wideDwords = 4;
   // If either pointer is in LDS, copy in 8-bytes
@@ -914,7 +871,7 @@ unsigned MutateEntryPoint::lowerCpsJump(Function *parent, cps::JumpOp *jumpOp, B
                                         SmallVectorImpl<CpsExitInfo> &exitInfos) {
   IRBuilder<> builder(parent->getContext());
   const DataLayout &layout = parent->getParent()->getDataLayout();
-  // Translate @lgc.cps.jump(CR %target, i32 %levels, T %state, ...) into:
+  // Translate @lgc.cps.jump(CR %target, i32 %levels, T %state, i32 %csp, ...) into:
   // @llvm.amdgcn.cs.chain(ptr %fn, i{32,64} %exec, T %sgprs, U %vgprs, i32 immarg %flags, ...)
   Value *vcr = jumpOp->getTarget();
   builder.SetInsertPoint(jumpOp);
@@ -934,7 +891,7 @@ unsigned MutateEntryPoint::lowerCpsJump(Function *parent, cps::JumpOp *jumpOp, B
 
   // Add extra args specific to the target function.
   SmallVector<Value *> remainingArgs;
-  for (Value *arg : drop_begin(jumpOp->args(), 3))
+  for (Value *arg : drop_begin(jumpOp->args(), 4))
     remainingArgs.push_back(arg);
 
   // Packing VGPR arguments {vcr, vsp, args...}
diff --git a/lgc/patch/NggPrimShader.cpp b/lgc/patch/NggPrimShader.cpp
index 73207cd6b0..795d0c15ca 100644
--- a/lgc/patch/NggPrimShader.cpp
+++ b/lgc/patch/NggPrimShader.cpp
@@ -34,6 +34,7 @@
 #include "lgc/state/PalMetadata.h"
 #include "lgc/util/Debug.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -549,6 +550,7 @@ Function *NggPrimShader::generate(Function *esMain, Function *gsMain, Function *
 
   // Assign names to ES, GS and copy shader main functions
   Module *module = nullptr;
+  bool createDbgInfo = false;
   if (esMain) {
     module = esMain->getParent();
 
@@ -558,6 +560,7 @@ Function *NggPrimShader::generate(Function *esMain, Function *gsMain, Function *
     esMain->setDLLStorageClass(GlobalValue::DefaultStorageClass);
     esMain->addFnAttr(Attribute::AlwaysInline);
     m_esHandlers.main = esMain;
+    createDbgInfo |= esMain->getSubprogram() != nullptr;
   }
 
   if (gsMain) {
@@ -569,6 +572,7 @@ Function *NggPrimShader::generate(Function *esMain, Function *gsMain, Function *
     gsMain->setDLLStorageClass(GlobalValue::DefaultStorageClass);
     gsMain->addFnAttr(Attribute::AlwaysInline);
     m_gsHandlers.main = gsMain;
+    createDbgInfo |= gsMain->getSubprogram() != nullptr;
 
     assert(copyShader); // Copy shader must be present
     copyShader->setName(NggCopyShader);
@@ -583,8 +587,8 @@ Function *NggPrimShader::generate(Function *esMain, Function *gsMain, Function *
   uint64_t inRegMask = 0;
   auto primShaderTy = getPrimShaderType(inRegMask);
 
-  Function *primShader =
-      createFunctionHelper(primShaderTy, GlobalValue::ExternalLinkage, module, lgcName::NggPrimShaderEntryPoint);
+  Function *primShader = createFunctionHelper(primShaderTy, GlobalValue::ExternalLinkage, module, createDbgInfo,
+                                              lgcName::NggPrimShaderEntryPoint);
   primShader->setDLLStorageClass(GlobalValue::DLLExportStorageClass);
   const unsigned waveSize = m_pipelineState->getShaderWaveSize(ShaderStage::Geometry);
   primShader->addFnAttr("target-features", ",+wavefrontsize" + std::to_string(waveSize)); // Set wavefront size
@@ -1018,8 +1022,8 @@ void NggPrimShader::buildPassthroughPrimShader(Function *primShader) {
     initWaveThreadInfo(mergedGroupInfo, mergedWaveInfo);
 
     if (m_gfxIp.major >= 11) {
-      // Record attribute ring base ([14:0])
-      m_nggInputs.attribRingBase = createUBfe(attribRingBase, 0, 15);
+      if (!m_pipelineState->exportAttributeByExportInstruction())
+        prepareAttribRingAccess(userData);
 
       if (m_pipelineState->enableSwXfb() || m_pipelineState->enablePrimStats())
         loadStreamOutBufferInfo(userData);
@@ -1344,8 +1348,8 @@ void NggPrimShader::buildPrimShader(Function *primShader) {
     initWaveThreadInfo(mergedGroupInfo, mergedWaveInfo);
 
     if (m_gfxIp.major >= 11) {
-      // Record attribute ring base ([14:0])
-      m_nggInputs.attribRingBase = createUBfe(attribRingBase, 0, 15);
+      if (!m_pipelineState->exportAttributeByExportInstruction())
+        prepareAttribRingAccess(userData);
 
       if (m_pipelineState->enableSwXfb() || m_pipelineState->enablePrimStats())
         loadStreamOutBufferInfo(userData);
@@ -2042,8 +2046,8 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) {
     initWaveThreadInfo(mergedGroupInfo, mergedWaveInfo);
 
     if (m_gfxIp.major >= 11) {
-      // Record attribute ring base ([14:0])
-      m_nggInputs.attribRingBase = createUBfe(attribRingBase, 0, 15);
+      if (!m_pipelineState->exportAttributeByExportInstruction())
+        prepareAttribRingAccess(userData);
 
       if (m_pipelineState->enableSwXfb() || m_pipelineState->enablePrimStats())
         loadStreamOutBufferInfo(userData);
@@ -2530,6 +2534,54 @@ void NggPrimShader::initWaveThreadInfo(Value *mergedGroupInfo, Value *mergedWave
   m_nggInputs.orderedWaveId = orderedWaveId;
 }
 
+// =====================================================================================================================
+// Prepare attribute ring access by collecting attribute count, modifying the STRIDE field of attribute ring buffer
+// descriptor, and calculating subgroup's attribute ring base offset.
+//
+// @param userData : User data
+void NggPrimShader::prepareAttribRingAccess(Value *userData) {
+  assert(m_gfxIp.major >= 11);                                    // For GFX11+
+  assert(!m_pipelineState->exportAttributeByExportInstruction()); // ATM is allowed
+
+  ShaderStageEnum shaderStage =
+      m_hasGs ? ShaderStage::Geometry : (m_hasTes ? ShaderStage::TessEval : ShaderStage::Vertex);
+  const unsigned attribCount = m_pipelineState->getShaderResourceUsage(shaderStage)->inOutUsage.expCount;
+  if (attribCount == 0)
+    return; // No vertex attribute exports
+
+  // attribRingBase[14:0]
+  auto entryPoint = m_builder.GetInsertBlock()->getParent();
+  Value *attribRingBase =
+      getFunctionArgument(entryPoint, ShaderMerger::getSpecialSgprInputIndex(m_gfxIp, EsGs::AttribRingBase));
+  attribRingBase = m_builder.CreateAnd(attribRingBase, 0x7FFF);
+
+  static const unsigned AttribGranularity = 32 * SizeOfVec4; // 32 * 16 bytes
+  m_attribRingBaseOffset =
+      m_builder.CreateMul(attribRingBase, m_builder.getInt32(AttribGranularity), "attribRingBaseOffset");
+
+  assert(userData->getType()->isVectorTy());
+  auto globalTablePtrValue = m_builder.CreateExtractElement(userData, static_cast<uint64_t>(0));
+  auto globalTablePtr = makePointer(globalTablePtrValue, PointerType::get(m_builder.getContext(), ADDR_SPACE_CONST));
+
+  m_attribRingBufDesc = readValueFromCb(FixedVectorType::get(m_builder.getInt32Ty(), 4), globalTablePtr,
+                                        m_builder.getInt32(SiDrvTableOffChipParamCache));
+
+  // Modify the field STRIDE of attribute ring buffer descriptor
+  if (attribCount >= 2) {
+    // STRIDE = WORD1[30:16], STRIDE is initialized to 16 by the driver, which is the right value for attribCount == 1.
+    // We override the value if there are more attributes.
+    auto descWord1 = m_builder.CreateExtractElement(m_attribRingBufDesc, 1);
+    auto stride = m_builder.getInt32(attribCount * SizeOfVec4);
+    if ((attribCount & 1) == 0) {
+      // Clear the bit that was set in STRIDE by the driver.
+      descWord1 = m_builder.CreateAnd(descWord1, ~0x3FFF0000);
+    }
+    descWord1 = m_builder.CreateOr(descWord1, m_builder.CreateShl(stride, 16)); // Set new STRIDE
+    m_attribRingBufDesc = m_builder.CreateInsertElement(m_attribRingBufDesc, descWord1, 1);
+  }
+  m_attribRingBufDesc->setName("attribRingBufDesc");
+}
+
 // =====================================================================================================================
 // Load stream-out info including stream-out buffer descriptors and buffer offsets.
 //
@@ -2561,18 +2613,6 @@ void NggPrimShader::loadStreamOutBufferInfo(Value *userData) {
     return userDataIndex;
   };
 
-  // Helper to make a pointer from its integer address value and the type
-  auto makePointer = [&](Value *ptrValue, Type *ptrTy) {
-    Value *pc = m_builder.CreateIntrinsic(Intrinsic::amdgcn_s_getpc, {}, {});
-    pc = m_builder.CreateBitCast(pc, FixedVectorType::get(m_builder.getInt32Ty(), 2));
-
-    Value *ptr = m_builder.CreateInsertElement(pc, ptrValue, static_cast<uint64_t>(0));
-    ptr = m_builder.CreateBitCast(ptr, m_builder.getInt64Ty());
-    ptr = m_builder.CreateIntToPtr(ptr, ptrTy);
-
-    return ptr;
-  };
-
   const auto gsOrEsMain = m_hasGs ? m_gsHandlers.main : m_esHandlers.main;
   StreamOutData streamOutData = {};
   if (m_hasGs)
@@ -2687,7 +2727,7 @@ void NggPrimShader::distributePrimitiveId(Value *primitiveId) {
     } else {
       assert(primitiveType == PrimitiveType::TriangleList || primitiveType == PrimitiveType::TriangleStrip ||
              primitiveType == PrimitiveType::TriangleFan || primitiveType == PrimitiveType::TriangleListAdjacency ||
-             primitiveType == PrimitiveType::TriangleStripAdjacency);
+             primitiveType == PrimitiveType::TriangleStripAdjacency || primitiveType == PrimitiveType::Rect);
       provokingVertexIndex = m_pipelineState->getRasterizerState().provokingVertexMode == ProvokingVertexFirst
                                  ? m_nggInputs.vertexIndex0
                                  : m_nggInputs.vertexIndex2;
@@ -3198,7 +3238,7 @@ void NggPrimShader::runEs(ArrayRef<Argument *> args) {
 
   assert(esArgs.size() == m_esHandlers.main->arg_size()); // Must have visit all arguments of ES entry point
 
-  CallInst *esCall = m_builder.CreateCall(m_esHandlers.main, esArgs);
+  CallInst *esCall = callFunctionHelper(m_esHandlers.main, esArgs, m_builder.GetInsertBlock());
   esCall->setCallingConv(CallingConv::AMDGPU_ES);
 }
 
@@ -3384,7 +3424,7 @@ Value *NggPrimShader::runPartEs(ArrayRef<Argument *> args, Value *position) {
 
   assert(partEsArgs.size() == partEs->arg_size()); // Must have visit all arguments of the part ES
 
-  CallInst *partEsCall = m_builder.CreateCall(partEs, partEsArgs);
+  CallInst *partEsCall = callFunctionHelper(partEs, partEsArgs, m_builder.GetInsertBlock());
   partEsCall->setCallingConv(CallingConv::AMDGPU_ES);
   return partEsCall;
 }
@@ -3838,15 +3878,9 @@ void NggPrimShader::runCopyShader(ArrayRef<Argument *> args) {
   if (m_gfxIp.major >= 11) {
     if (!m_pipelineState->exportAttributeByExportInstruction())
       appendAttributeThroughMemoryArguments(copyShaderArgs);
-
-    // Global table
-    auto userData = args[NumSpecialSgprInputs];
-    assert(userData->getType()->isVectorTy());
-    auto globalTable = m_builder.CreateExtractElement(userData, static_cast<uint64_t>(0)); // The first user data SGPRs
-    copyShaderArgs.push_back(globalTable);
   }
 
-  // Relative vertex index in subgroup
+  // Relative vertex index in subgroup (to access GS-VS ring, without vertex compaction)
   copyShaderArgs.push_back(vertexIndex);
 
   CallInst *copyShaderCall = m_builder.CreateCall(m_gsHandlers.copyShader, copyShaderArgs);
@@ -6085,32 +6119,31 @@ Value *NggPrimShader::ballot(Value *value) {
 
 // =====================================================================================================================
 // Export vertex attribute through memory (ATM) by handing the calls. We mutate the argument list of the target function
-// by adding two additional arguments (one is attribute ring base and the other is relative vertex index in subgroup).
-// Also, we expand all export calls by replacing it with real instructions that do vertex attribute exporting through
-// memory.
+// by adding three additional arguments (attribute ring buffer descriptor, attribute ring base offset, and relative
+// vertex index in subgroup). Also, we expand all export calls by replacing them with real instructions that do vertex
+// attribute exporting through memory.
 //
 // @param [in/out] target : Target function to process vertex attribute export
 void NggPrimShader::exportVertexAttributeThroughMemory(Function *&target) {
   assert(m_gfxIp.major >= 11);                                    // For GFX11+
   assert(!m_pipelineState->exportAttributeByExportInstruction()); // ATM is allowed
 
-  ShaderStageEnum shaderStage =
-      m_hasGs ? ShaderStage::Geometry : (m_hasTes ? ShaderStage::TessEval : ShaderStage::Vertex);
-  const unsigned attribCount = m_pipelineState->getShaderResourceUsage(shaderStage)->inOutUsage.expCount;
-  if (attribCount == 0)
-    return; // No vertex attribute exports
+  if (!m_attribRingBufDesc && !m_attribRingBaseOffset)
+    return; // No ATM, no attributes to export
 
   IRBuilder<>::InsertPointGuard guard(m_builder);
 
   //
   // Mutate the argument list by adding two additional arguments
   //
-  auto newTarget = addFunctionArgs(target, nullptr,
-                                   {
-                                       m_builder.getInt32Ty(), // Attribute ring base (SGPR)
-                                       m_builder.getInt32Ty()  // Relative vertex index in subgroup (VGPR)
-                                   },
-                                   {"attribRingBase", "vertexIndex"}, 0x1);
+  auto newTarget =
+      addFunctionArgs(target, nullptr,
+                      {
+                          FixedVectorType::get(m_builder.getInt32Ty(), 4), // Attribute ring buffer descriptor (4 SGPRs)
+                          m_builder.getInt32Ty(),                          // Attribute ring base offset (SGPR)
+                          m_builder.getInt32Ty()                           // Relative vertex index in subgroup (VGPR)
+                      },
+                      {"attribRingBufDesc", "attribRingBaseOffset", "vertexIndex"}, 0x3);
 
   // Original function is no longer needed
   assert(target->use_empty());
@@ -6121,19 +6154,14 @@ void NggPrimShader::exportVertexAttributeThroughMemory(Function *&target) {
   //
   // Expand vertex attribute export calls by replacing them with real instructions
   //
-  Value *attribRingBufDesc = nullptr;
 
-  // Always the first two arguments, added by us
-  auto attribRingBase = target->getArg(0);
-  auto vertexIndex = target->getArg(1);
+  // Always the first three arguments, added by us
+  auto attribRingBufDesc = target->getArg(0);
+  auto attribRingBaseOffset = target->getArg(1);
+  auto vertexIndex = target->getArg(2);
 
   m_builder.SetInsertPointPastAllocas(target);
 
-  // ringOffset = attribRingBase * 32 * 16
-  //            = attribRingBase * 512
-  static const unsigned AttribGranularity = 32 * SizeOfVec4; // 32 * 16 bytes
-  auto ringOffset = m_builder.CreateMul(attribRingBase, m_builder.getInt32(AttribGranularity));
-
   SmallVector<CallInst *, 8> removedCalls;
 
   for (auto &func : target->getParent()->functions()) {
@@ -6145,45 +6173,21 @@ void NggPrimShader::exportVertexAttributeThroughMemory(Function *&target) {
         if (call->getParent()->getParent() != target)
           continue; // Export call doesn't belong to targeted function, skip
 
-        // NOTE: We always set the insert point before the terminator of the basic block to which this call belongs.
-        // This is because we might modify attribute ring buffer descriptor and this modified descriptor will be used
-        // by subsequent ring buffer store instructions that do vertex attribute exporting.
-        m_builder.SetInsertPoint(call->getParent()->getTerminator());
-
-        if (!attribRingBufDesc) {
-          attribRingBufDesc = call->getArgOperand(0); // Initialize it if necessary
-
-          // Fixup the STRIDE field if necessary, STRIDE = WORD1[30:16].
-          //
-          // STRIDE is initialized to 16 by the driver, which is the right value for attribCount == 1.
-          // We override the value if there are more attributes.
-          if (attribCount > 1) {
-            auto descWord1 = m_builder.CreateExtractElement(attribRingBufDesc, 1);
-            auto stride = m_builder.getInt32(attribCount * SizeOfVec4);
-            if ((attribCount & 1) == 0) {
-              // Clear the bit that was set in STRIDE by the driver.
-              descWord1 = m_builder.CreateAnd(descWord1, ~0x3FFF0000);
-            }
-            descWord1 = m_builder.CreateOr(descWord1, m_builder.CreateShl(stride, 16)); // Set new STRIDE
-            attribRingBufDesc = m_builder.CreateInsertElement(attribRingBufDesc, descWord1, 1);
-          }
-        }
-
-        const unsigned location = cast<ConstantInt>(call->getArgOperand(1))->getZExtValue();
-        auto attribValue = call->getArgOperand(2);
+        m_builder.SetInsertPoint(call);
 
         // Export vertex attributes
-        assert(attribValue->getType() == FixedVectorType::get(m_builder.getFloatTy(), 4)); // Must be <4 xfloat>
-
+        const unsigned location = cast<ConstantInt>(call->getArgOperand(0))->getZExtValue();
         auto locationOffset = m_builder.getInt32(location * SizeOfVec4);
 
+        auto attribValue = call->getArgOperand(1);
+        assert(attribValue->getType() == FixedVectorType::get(m_builder.getFloatTy(), 4)); // Must be <4 x float>
+
         CoherentFlag coherent = {};
         if (m_pipelineState->getTargetInfo().getGfxIpVersion().major <= 11) {
           coherent.bits.glc = true;
-          coherent.bits.slc = true;
         }
         m_builder.CreateIntrinsic(Intrinsic::amdgcn_struct_buffer_store, attribValue->getType(),
-                                  {attribValue, attribRingBufDesc, vertexIndex, locationOffset, ringOffset,
+                                  {attribValue, attribRingBufDesc, vertexIndex, locationOffset, attribRingBaseOffset,
                                    m_builder.getInt32(coherent.u32All)});
 
         removedCalls.push_back(call);
@@ -6234,6 +6238,7 @@ void NggPrimShader::exportVertexAttributeThroughMemory(Function *&target) {
 
     // Before the first export call, add s_wait_vscnt 0 to make sure the completion of all attributes being written
     // to the attribute ring buffer
+    assert(!exportCalls.empty()); // Position export is always present
     m_builder.SetInsertPoint(exportCalls[0]);
     m_builder.CreateFence(AtomicOrdering::Release, m_builder.getContext().getOrInsertSyncScopeID("agent"));
   }
@@ -6247,23 +6252,21 @@ void NggPrimShader::exportVertexAttributeThroughMemory(Function *&target) {
 
 // =====================================================================================================================
 // Append additional arguments to the argument list for attribute-through-memory (ATM) of the specified shader stage.
-// Currently, two arguments are required to do attribute-through-memory: (1) the attribute ring base; (2) relative
-// vertex index in NGG subgroup.
+// Currently, three arguments are required to do attribute-through-memory:
+//   (1) Attribute ring buffer descriptor;
+//   (2) Attribute ring base offset;
+//   (3) Relative vertex index in NGG subgroup.
 //
 // @param [in/out] args : The arguments that will be appended to
 void NggPrimShader::appendAttributeThroughMemoryArguments(SmallVectorImpl<llvm::Value *> &args) {
   assert(m_gfxIp.major >= 11);                                    // For GFX11+
   assert(!m_pipelineState->exportAttributeByExportInstruction()); // ATM is allowed
 
-  const auto attribCount =
-      m_pipelineState
-          ->getShaderResourceUsage(m_hasGs ? ShaderStage::Geometry
-                                           : (m_hasTes ? ShaderStage::TessEval : ShaderStage::Vertex))
-          ->inOutUsage.expCount;
-  if (attribCount == 0)
-    return; // No attributes
+  if (!m_attribRingBufDesc && !m_attribRingBaseOffset)
+    return; // No ATM, no attributes to export
 
-  args.push_back(m_nggInputs.attribRingBase);
+  args.push_back(m_attribRingBufDesc);
+  args.push_back(m_attribRingBaseOffset);
   args.push_back(m_nggInputs.threadIdInSubgroup);
 }
 
@@ -7379,13 +7382,7 @@ Value *NggPrimShader::fetchXfbOutput(Function *target, ArrayRef<Argument *> args
   //
   if (m_hasGs) {
     // Copy shader has fixed argument layout
-    Value *userData = args[NumSpecialSgprInputs];
-    assert(userData->getType()->isVectorTy());
-
-    auto globalTable = m_builder.CreateExtractElement(userData, static_cast<uint64_t>(0));
-    return m_builder.CreateCall(xfbFetcher,
-                                {globalTable,                      // Global table
-                                 m_nggInputs.threadIdInSubgroup}); // Relative vertex index in subgroup
+    return m_builder.CreateCall(xfbFetcher, {m_nggInputs.threadIdInSubgroup});
   }
 
   Value *offChipLdsBase = args[ShaderMerger::getSpecialSgprInputIndex(m_gfxIp, EsGs::OffChipLdsBase)];
@@ -7863,6 +7860,24 @@ Value *NggPrimShader::createUBfe(Value *value, unsigned offset, unsigned count)
   return m_builder.CreateAnd(m_builder.CreateLShr(value, offset), (1U << count) - 1);
 }
 
+// =====================================================================================================================
+// Make a 64-bit pointer of the specified type from a 32-bit integer value, taking the remaining 32 bits from the PC.
+//
+// @param ptrValue : 32-bit integer value (i32) that is inserted as element 0 of the PC viewed as <2 x i32>
+// @param ptrTy : Pointer type that the resulting 64-bit value is converted to and returned as
+Value *NggPrimShader::makePointer(Value *ptrValue, Type *ptrTy) {
+  assert(ptrValue->getType()->isIntegerTy(32)); // Must be i32
+
+  Value *pc = m_builder.CreateIntrinsic(Intrinsic::amdgcn_s_getpc, {}, {});
+  pc = m_builder.CreateBitCast(pc, FixedVectorType::get(m_builder.getInt32Ty(), 2)); // View PC as <2 x i32>
+
+  Value *ptr = m_builder.CreateInsertElement(pc, ptrValue, static_cast<uint64_t>(0)); // Overwrite element 0
+  ptr = m_builder.CreateBitCast(ptr, m_builder.getInt64Ty());
+  ptr = m_builder.CreateIntToPtr(ptr, ptrTy); // Convert the combined i64 to the requested pointer type
+
+  return ptr;
+}
+
 // =====================================================================================================================
 // Create a PHI node with the specified incomings.
 //
diff --git a/lgc/patch/NggPrimShader.h b/lgc/patch/NggPrimShader.h
index ff7afecfcf..4f60a3e7a9 100644
--- a/lgc/patch/NggPrimShader.h
+++ b/lgc/patch/NggPrimShader.h
@@ -226,6 +226,7 @@ class NggPrimShader {
   void buildPrimShaderWithGs(llvm::Function *entryPoint);
 
   void initWaveThreadInfo(llvm::Value *mergedGroupInfo, llvm::Value *mergedWaveInfo);
+  void prepareAttribRingAccess(llvm::Value *userData);
   void loadStreamOutBufferInfo(llvm::Value *userData);
   void distributePrimitiveId(llvm::Value *primitiveId);
 
@@ -319,6 +320,7 @@ class NggPrimShader {
 
   llvm::BasicBlock *createBlock(llvm::Function *parent, const llvm::Twine &blockName = "");
   llvm::Value *createUBfe(llvm::Value *value, unsigned offset, unsigned count);
+  llvm::Value *makePointer(llvm::Value *ptrValue, llvm::Type *ptrTy);
   llvm::PHINode *createPhi(llvm::ArrayRef<std::pair<llvm::Value *, llvm::BasicBlock *>> incomings,
                            const llvm::Twine &name = "");
   void createFenceAndBarrier();
@@ -354,7 +356,6 @@ class NggPrimShader {
     llvm::Value *waveIdInSubgroup; // Wave ID in subgroup
     llvm::Value *orderedWaveId;    // Ordered wave ID
 
-    llvm::Value *attribRingBase;                                 // Attribute ring base for this subgroup
     std::pair<llvm::Value *, llvm::Value *> primShaderTableAddr; // Primitive shader table address <low, high>
 
     // VGPRs
@@ -406,9 +407,13 @@ class NggPrimShader {
   unsigned m_maxThreadsPerSubgroup = 0; // Maximum number of threads in a NGG subgroup
   unsigned m_maxWavesPerSubgroup = 0;   // Maximum number of waves in a NGG subgroup
 
+  llvm::Value *m_attribRingBufDesc = nullptr;    // Attribute ring buffer descriptor
+  llvm::Value *m_attribRingBaseOffset = nullptr; // Subgroup's attribute ring base offset (in bytes)
+
   llvm::Value *m_streamOutControlBufPtr = nullptr;                      // Stream-out control buffer pointer
   llvm::Value *m_streamOutBufDescs[MaxTransformFeedbackBuffers] = {};   // Stream-out buffer descriptors
   llvm::Value *m_streamOutBufOffsets[MaxTransformFeedbackBuffers] = {}; // Stream-out buffer offsets
+
   llvm::Value *m_verticesPerPrimitive = nullptr; // If topology is dynamic, it is a SGPR value from user data
                                                  // ComplexData; otherwise it is a constant.
 
diff --git a/lgc/patch/PassRegistry.inc b/lgc/patch/PassRegistry.inc
index 322336a266..ed87a7c05c 100644
--- a/lgc/patch/PassRegistry.inc
+++ b/lgc/patch/PassRegistry.inc
@@ -58,25 +58,26 @@ LLPC_MODULE_PASS("lgc-continufy", Continufy)
 LLPC_MODULE_PASS("lgc-patch-resource-collect", PatchResourceCollect)
 LLPC_MODULE_PASS("lgc-patch-initialize-workgroup-memory", PatchInitializeWorkgroupMemory)
 LLPC_MODULE_PASS("lgc-lower-image-derivatives", LowerImageDerivatives)
-LLPC_MODULE_PASS("lgc-patch-in-out-import-export", PatchInOutImportExport)
+LLPC_MODULE_PASS("lgc-lower-in-out", LowerInOut)
 LLPC_FUNCTION_PASS("lgc-lower-invariant-loads", LowerInvariantLoads)
 LLPC_MODULE_PASS("lgc-patch-setup-target-features", PatchSetupTargetFeatures)
 LLPC_MODULE_PASS("lgc-generate-copy-shader", GenerateCopyShader)
 LLPC_MODULE_PASS("lgc-patch-prepare-pipeline-abi", PatchPreparePipelineAbi)
 LLPC_FUNCTION_PASS("lgc-patch-read-first-lane", PatchReadFirstLane)
-LLPC_MODULE_PASS("lgc-patch-llvm-ir-inclusion", PatchLlvmIrInclusion)
+LLPC_MODULE_PASS("lgc-include-llvm-ir", IncludeLlvmIr)
 LLPC_FUNCTION_PASS("lgc-patch-peephole-opt", PatchPeepholeOpt)
 LLPC_MODULE_PASS("lgc-lower-subgroup-ops", LowerSubgroupOps)
 LLPC_MODULE_PASS("lgc-mutate-entry-point", MutateEntryPoint)
 LLPC_MODULE_PASS("lgc-patch-check-shader-cache", CheckShaderCache)
-LLPC_LOOP_PASS("lgc-patch-loop-metadata", PatchLoopMetadata)
+LLPC_LOOP_PASS("lgc-add-loop-metadata", AddLoopMetadata)
+LLPC_FUNCTION_PASS("lgc-structurize-buffers", StructurizeBuffers)
 LLPC_FUNCTION_PASS("lgc-patch-buffer-op", PatchBufferOp)
 LLPC_MODULE_PASS("lgc-patch-workarounds", PatchWorkarounds)
-LLPC_FUNCTION_PASS("lgc-patch-load-scalarizer", PatchLoadScalarizer)
-LLPC_FUNCTION_PASS("lgc-patch-mul-dx9-zero", PatchMulDx9Zero)
+LLPC_FUNCTION_PASS("lgc-scalarizer-loads", ScalarizeLoads)
+LLPC_FUNCTION_PASS("lgc-lower-mul-dx9-zero", LowerMulDx9Zero)
 LLPC_MODULE_PASS("lgc-patch-null-frag-shader", PatchNullFragShader)
 LLPC_MODULE_PASS("lgc-patch-tcs-passthrough-shader", TcsPassthroughShader)
-LLPC_MODULE_PASS("lgc-patch-image-op-collect", PatchImageOpCollect)
+LLPC_MODULE_PASS("lgc-collect-image-operations", CollectImageOperations)
 LLPC_MODULE_PASS("lgc-vertex-fetch", LowerVertexFetch)
 LLPC_MODULE_PASS("lgc-frag-color-export", LowerFragColorExport)
 LLPC_MODULE_PASS("lgc-lower-debug-printf", LowerDebugPrintf)
diff --git a/lgc/patch/TcsPassthroughShader.cpp b/lgc/patch/PassthroughHullShader.cpp
similarity index 99%
rename from lgc/patch/TcsPassthroughShader.cpp
rename to lgc/patch/PassthroughHullShader.cpp
index 684a213f56..3c86fd8052 100644
--- a/lgc/patch/TcsPassthroughShader.cpp
+++ b/lgc/patch/PassthroughHullShader.cpp
@@ -24,11 +24,11 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  TcsPassthroughShader.cpp
+ * @file  PassthroughHullShader.cpp
  * @brief LLPC source file: contains declaration and implementation of class lgc::TcsPassthroughShader.
  ***********************************************************************************************************************
  */
-#include "lgc/patch/TcsPassthroughShader.h"
+#include "lgc/patch/PassthroughHullShader.h"
 #include "lgc/LgcContext.h"
 #include "lgc/LgcDialect.h"
 #include "lgc/builder/BuilderImpl.h"
diff --git a/lgc/patch/Patch.cpp b/lgc/patch/Patch.cpp
index bac8f27977..48ea4a2b88 100644
--- a/lgc/patch/Patch.cpp
+++ b/lgc/patch/Patch.cpp
@@ -29,21 +29,24 @@
  ***********************************************************************************************************************
  */
 #include "lgc/patch/Patch.h"
+#include "GenerateNullFragmentShader.h"
 #include "LowerPopsInterlock.h"
 #include "LowerRayQueryWrapper.h"
-#include "PatchNullFragShader.h"
 #include "llvmraytracing/Continuations.h"
 #include "lgc/LgcContext.h"
 #include "lgc/PassManager.h"
 #include "lgc/Pipeline.h"
 #include "lgc/builder/BuilderReplayer.h"
 #include "lgc/patch/AddLoopMetadata.h"
+#include "lgc/patch/ApplyWorkarounds.h"
 #include "lgc/patch/CheckShaderCache.h"
 #include "lgc/patch/CollectImageOperations.h"
+#include "lgc/patch/CollectResourceUsage.h"
 #include "lgc/patch/Continufy.h"
-#include "lgc/patch/FragColorExport.h"
+#include "lgc/patch/FragmentColorExport.h"
 #include "lgc/patch/GenerateCopyShader.h"
 #include "lgc/patch/IncludeLlvmIr.h"
+#include "lgc/patch/LowerBufferOperations.h"
 #include "lgc/patch/LowerDebugPrintf.h"
 #include "lgc/patch/LowerDesc.h"
 #include "lgc/patch/LowerGpuRt.h"
@@ -51,18 +54,16 @@
 #include "lgc/patch/LowerInOut.h"
 #include "lgc/patch/LowerInvariantLoads.h"
 #include "lgc/patch/LowerMulDx9Zero.h"
+#include "lgc/patch/LowerReadFirstLane.h"
 #include "lgc/patch/LowerSubgroupOps.h"
 #include "lgc/patch/MutateEntryPoint.h"
-#include "lgc/patch/PatchBufferOp.h"
+#include "lgc/patch/PassthroughHullShader.h"
 #include "lgc/patch/PatchInitializeWorkgroupMemory.h"
-#include "lgc/patch/PatchPeepholeOpt.h"
-#include "lgc/patch/PatchPreparePipelineAbi.h"
-#include "lgc/patch/PatchReadFirstLane.h"
-#include "lgc/patch/PatchResourceCollect.h"
-#include "lgc/patch/PatchSetupTargetFeatures.h"
-#include "lgc/patch/PatchWorkarounds.h"
+#include "lgc/patch/PeepholeOptimization.h"
+#include "lgc/patch/PreparePipelineAbi.h"
 #include "lgc/patch/ScalarizeLoads.h"
-#include "lgc/patch/TcsPassthroughShader.h"
+#include "lgc/patch/SetupTargetFeatures.h"
+#include "lgc/patch/StructurizeBuffers.h"
 #include "lgc/patch/VertexFetch.h"
 
 #if LLPC_BUILD_STRIX1
@@ -214,11 +215,11 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T
   passMgr.addPass(MutateEntryPoint());
   passMgr.addPass(createModuleToFunctionPassAdaptor(LowerPopsInterlock()));
   passMgr.addPass(PatchInitializeWorkgroupMemory());
-  passMgr.addPass(PatchInOutImportExport());
+  passMgr.addPass(LowerInOut());
 
   // Patch invariant load and loop metadata.
   passMgr.addPass(createModuleToFunctionPassAdaptor(LowerInvariantLoads()));
-  passMgr.addPass(createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor(PatchLoopMetadata())));
+  passMgr.addPass(createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor(AddLoopMetadata())));
 
 #if LLPC_BUILD_STRIX1
   passMgr.addPass(WorkaroundDsSubdwordWrite());
@@ -238,7 +239,7 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T
 
   // Collect image operations
   if (pipelineState->getTargetInfo().getGfxIpVersion().major >= 11)
-    passMgr.addPass(PatchImageOpCollect());
+    passMgr.addPass(CollectImageOperations());
 
   // Second part of lowering to "AMDGCN-style"
   passMgr.addPass(PatchPreparePipelineAbi());
@@ -261,6 +262,7 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T
     FunctionPassManager fpm;
     fpm.addPass(PromotePass());
     fpm.addPass(ADCEPass());
+    fpm.addPass(StructurizeBuffers());
     fpm.addPass(PatchBufferOp());
     fpm.addPass(InstCombinePass());
     fpm.addPass(SimplifyCFGPass());
@@ -272,6 +274,7 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T
     }
   } else {
     FunctionPassManager fpm;
+    fpm.addPass(StructurizeBuffers());
     fpm.addPass(PatchBufferOp());
     fpm.addPass(InstCombinePass());
     passMgr.addPass(createModuleToFunctionPassAdaptor(std::move(fpm)));
@@ -287,7 +290,7 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T
 
   // Include LLVM IR as a separate section in the ELF binary
   if (pipelineState->getOptions().includeIr)
-    passMgr.addPass(PatchLlvmIrInclusion());
+    passMgr.addPass(IncludeLlvmIr());
 
   // Stop timer for patching passes.
   if (patchTimer)
@@ -443,8 +446,8 @@ void Patch::addOptimizationPasses(lgc::PassManager &passMgr, uint32_t optLevel)
   scalarizerOptions.ScalarizeMinBits = 32;
   fpm.addPass(ScalarizerPass(scalarizerOptions));
 #endif
-  fpm.addPass(PatchMulDx9Zero());
-  fpm.addPass(PatchLoadScalarizer());
+  fpm.addPass(LowerMulDx9Zero());
+  fpm.addPass(ScalarizeLoads());
   fpm.addPass(InstSimplifyPass());
   fpm.addPass(NewGVNPass());
   fpm.addPass(BDCEPass());
diff --git a/lgc/patch/PatchPeepholeOpt.cpp b/lgc/patch/PeepholeOptimization.cpp
similarity index 98%
rename from lgc/patch/PatchPeepholeOpt.cpp
rename to lgc/patch/PeepholeOptimization.cpp
index 5cb83f72c3..ced9761d8d 100644
--- a/lgc/patch/PatchPeepholeOpt.cpp
+++ b/lgc/patch/PeepholeOptimization.cpp
@@ -24,11 +24,11 @@
  **********************************************************************************************************************/
 /**
  ***********************************************************************************************************************
- * @file  PatchPeepholeOpt.cpp
+ * @file  PeepholeOptimization.cpp
  * @brief LLPC source file: contains implementation of class lgc::PatchPeepholeOpt.
  ***********************************************************************************************************************
  */
-#include "lgc/patch/PatchPeepholeOpt.h"
+#include "lgc/patch/PeepholeOptimization.h"
 #include "lgc/Builder.h"
 #include "lgc/patch/Patch.h"
 #include "llvm/IR/Constants.h"
diff --git a/lgc/patch/PatchPreparePipelineAbi.cpp b/lgc/patch/PreparePipelineAbi.cpp
similarity index 99%
rename from lgc/patch/PatchPreparePipelineAbi.cpp
rename to lgc/patch/PreparePipelineAbi.cpp
index a42f59cc22..0991a5efde 100644
--- a/lgc/patch/PatchPreparePipelineAbi.cpp
+++ b/lgc/patch/PreparePipelineAbi.cpp
@@ -24,11 +24,11 @@
  **********************************************************************************************************************/
 /**
 ***********************************************************************************************************************
-* @file  PatchPreparePipelineAbi.cpp
+* @file  PreparePipelineAbi.cpp
 * @brief LLPC source file: contains implementation of class lgc::PatchPreparePipelineAbi.
 ***********************************************************************************************************************
 */
-#include "lgc/patch/PatchPreparePipelineAbi.h"
+#include "lgc/patch/PreparePipelineAbi.h"
 #include "MeshTaskShader.h"
 #include "RegisterMetadataBuilder.h"
 #include "ShaderMerger.h"
diff --git a/lgc/patch/RegisterMetadataBuilder.cpp b/lgc/patch/RegisterMetadataBuilder.cpp
index c982836fc9..3d15ebe444 100644
--- a/lgc/patch/RegisterMetadataBuilder.cpp
+++ b/lgc/patch/RegisterMetadataBuilder.cpp
@@ -518,6 +518,9 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() {
     case PrimitiveType::TriangleStripAdjacency:
       gsOutputPrimitiveType = TRISTRIP;
       break;
+    case PrimitiveType::Rect:
+      gsOutputPrimitiveType = RECTLIST__GFX10PLUS;
+      break;
     case PrimitiveType::Patch:
       gsOutputPrimitiveType = POINTLIST;
       break;
@@ -1054,6 +1057,20 @@ void RegisterMetadataBuilder::buildCsRegisters(ShaderStageEnum shaderStage) {
   getComputeRegNode()[Util::Abi::ComputeRegisterMetadataKey::TidigCompCnt] = tidigCompCnt;
 
   setThreadgroupDimensions(workgroupSizes);
+
+  // Only check X dimension of original size
+  if (computeMode.origWorkgroupSizeX) {
+    if (foldWorkgroupXY) {
+      workgroupSizes[0] = computeMode.origWorkgroupSizeX * computeMode.origWorkgroupSizeY;
+      workgroupSizes[1] = computeMode.origWorkgroupSizeZ;
+      workgroupSizes[2] = 1;
+    } else {
+      workgroupSizes[0] = computeMode.origWorkgroupSizeX;
+      workgroupSizes[1] = computeMode.origWorkgroupSizeY;
+      workgroupSizes[2] = computeMode.origWorkgroupSizeZ;
+    }
+    setOrigThreadgroupDimensions(workgroupSizes);
+  }
 }
 
 // =====================================================================================================================
diff --git a/lgc/patch/ScalarizeLoads.cpp b/lgc/patch/ScalarizeLoads.cpp
index 964e0a9636..22ea17fd6d 100644
--- a/lgc/patch/ScalarizeLoads.cpp
+++ b/lgc/patch/ScalarizeLoads.cpp
@@ -25,7 +25,7 @@
 /**
  ***********************************************************************************************************************
  * @file  ScalarizeLoads.cpp
- * @brief LLPC source file: contains implementation of class lgc::PatchLoadScalarizer.
+ * @brief LLPC source file: contains implementation of class lgc::ScalarizeLoads.
  ***********************************************************************************************************************
  */
 #include "lgc/patch/ScalarizeLoads.h"
@@ -36,7 +36,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
-#define DEBUG_TYPE "lgc-patch-load-scalarizer"
+#define DEBUG_TYPE "lgc-scalarizer-loads"
 
 using namespace lgc;
 using namespace llvm;
@@ -44,7 +44,7 @@ using namespace llvm;
 namespace lgc {
 
 // =====================================================================================================================
-PatchLoadScalarizer::PatchLoadScalarizer() {
+ScalarizeLoads::ScalarizeLoads() {
   m_scalarThreshold = 0;
 }
 
@@ -54,7 +54,7 @@ PatchLoadScalarizer::PatchLoadScalarizer() {
 // @param [in/out] function : Function that we will peephole optimize.
 // @param [in/out] analysisManager : Analysis manager to use for this transformation
 // @returns : The preserved analyses (The analyses that are still valid after this pass)
-PreservedAnalyses PatchLoadScalarizer::run(Function &function, FunctionAnalysisManager &analysisManager) {
+PreservedAnalyses ScalarizeLoads::run(Function &function, FunctionAnalysisManager &analysisManager) {
   const auto &moduleAnalysisManager = analysisManager.getResult<ModuleAnalysisManagerFunctionProxy>(function);
   PipelineState *pipelineState =
       moduleAnalysisManager.getCachedResult<PipelineStateWrapper>(*function.getParent())->getPipelineState();
@@ -89,7 +89,7 @@ PreservedAnalyses PatchLoadScalarizer::run(Function &function, FunctionAnalysisM
 // Visits "load" instruction.
 //
 // @param loadInst : The instruction
-void PatchLoadScalarizer::visitLoadInst(LoadInst &loadInst) {
+void ScalarizeLoads::visitLoadInst(LoadInst &loadInst) {
   const unsigned addrSpace = loadInst.getPointerAddressSpace();
   auto loadTy = dyn_cast<FixedVectorType>(loadInst.getType());
 
diff --git a/lgc/patch/PatchSetupTargetFeatures.cpp b/lgc/patch/SetupTargetFeatures.cpp
similarity index 99%
rename from lgc/patch/PatchSetupTargetFeatures.cpp
rename to lgc/patch/SetupTargetFeatures.cpp
index fbcdcda5f8..b5235db7a4 100644
--- a/lgc/patch/PatchSetupTargetFeatures.cpp
+++ b/lgc/patch/SetupTargetFeatures.cpp
@@ -24,11 +24,11 @@
  **********************************************************************************************************************/
 /**
 ***********************************************************************************************************************
-* @file  PatchSetupTargetFeatures.cpp
+* @file  SetupTargetFeatures.cpp
 * @brief LLPC source file: contains declaration and implementation of class lgc::PatchSetupTargetFeatures.
 ***********************************************************************************************************************
 */
-#include "lgc/patch/PatchSetupTargetFeatures.h"
+#include "lgc/patch/SetupTargetFeatures.h"
 #include "lgc/patch/Patch.h"
 #include "lgc/state/PipelineState.h"
 #include "lgc/state/TargetInfo.h"
diff --git a/lgc/patch/ShaderInputs.cpp b/lgc/patch/ShaderInputs.cpp
index fd8b6c4a02..a58996ef4f 100644
--- a/lgc/patch/ShaderInputs.cpp
+++ b/lgc/patch/ShaderInputs.cpp
@@ -403,7 +403,7 @@ void ShaderInputs::fixupUses(Module &module, PipelineState *pipelineState, bool
       // However, in some cases, the builtInUsage field is used in NggPrimShader and/or Gfx*ConfigBuilder
       // (both run later on) to tell that the input is in use. For those cases, we must keep the builtInUsage
       // field, and set it here.
-      // Add code here as built-ins are moved from PatchInOutImportExport to InOutBuilder.
+      // Add code here as built-ins are moved from LowerInOut to InOutBuilder.
       auto &builtInUsage = pipelineState->getShaderResourceUsage(stage.value())->builtInUsage;
       switch (stage.value()) {
       case ShaderStage::Vertex:
diff --git a/lgc/patch/ShaderMerger.cpp b/lgc/patch/ShaderMerger.cpp
index 7bf1e63398..47b86b1bf0 100644
--- a/lgc/patch/ShaderMerger.cpp
+++ b/lgc/patch/ShaderMerger.cpp
@@ -31,13 +31,14 @@
 #include "ShaderMerger.h"
 #include "NggPrimShader.h"
 #include "lgc/patch/Patch.h"
-#include "lgc/patch/PatchPreparePipelineAbi.h"
+#include "lgc/patch/PreparePipelineAbi.h"
 #include "lgc/patch/SystemValues.h"
 #include "lgc/state/PalMetadata.h"
 #include "lgc/state/PipelineShaders.h"
 #include "lgc/state/PipelineState.h"
 #include "lgc/util/BuilderBase.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
@@ -285,14 +286,17 @@ FunctionType *ShaderMerger::generateLsHsEntryPointType(uint64_t *inRegMask) cons
 // @param lsEntryPoint : Entry-point of hardware local shader (LS) (could be null)
 // @param hsEntryPoint : Entry-point of hardware hull shader (HS)
 Function *ShaderMerger::generateLsHsEntryPoint(Function *lsEntryPoint, Function *hsEntryPoint) {
+  bool createDbgInfo = false;
   if (lsEntryPoint) {
     lsEntryPoint->setLinkage(GlobalValue::InternalLinkage);
     lsEntryPoint->addFnAttr(Attribute::AlwaysInline);
+    createDbgInfo |= lsEntryPoint->getSubprogram() != nullptr;
   }
 
   assert(hsEntryPoint);
   hsEntryPoint->setLinkage(GlobalValue::InternalLinkage);
   hsEntryPoint->addFnAttr(Attribute::AlwaysInline);
+  createDbgInfo |= hsEntryPoint->getSubprogram() != nullptr;
 
   processRayQueryLdsStack(lsEntryPoint, hsEntryPoint);
 
@@ -303,7 +307,7 @@ Function *ShaderMerger::generateLsHsEntryPoint(Function *lsEntryPoint, Function
   // because the vertex fetch shader will be prepended to this module and expect the fall through into the merged
   // shader.
   Function *entryPoint = createFunctionHelper(entryPointTy, GlobalValue::ExternalLinkage, hsEntryPoint->getParent(),
-                                              lgcName::LsHsEntryPoint);
+                                              createDbgInfo, lgcName::LsHsEntryPoint);
   entryPoint->setDLLStorageClass(GlobalValue::DLLExportStorageClass);
   setShaderStage(entryPoint, ShaderStage::TessControl);
 
@@ -443,7 +447,7 @@ Function *ShaderMerger::generateLsHsEntryPoint(Function *lsEntryPoint, Function
 
     appendArguments(lsArgs, vertexFetches);
 
-    CallInst *call = builder.CreateCall(lsEntryPoint, lsArgs);
+    CallInst *call = callFunctionHelper(lsEntryPoint, lsArgs, builder.GetInsertBlock());
     call->setCallingConv(CallingConv::AMDGPU_LS);
   }
 
@@ -510,7 +514,7 @@ Function *ShaderMerger::generateLsHsEntryPoint(Function *lsEntryPoint, Function
     hsArgs.push_back(patchId);
     hsArgs.push_back(relPatchId);
 
-    CallInst *call = builder.CreateCall(hsEntryPoint, hsArgs);
+    CallInst *call = callFunctionHelper(hsEntryPoint, hsArgs, builder.GetInsertBlock());
     call->setCallingConv(CallingConv::AMDGPU_HS);
   }
   builder.CreateBr(endHsBlock);
@@ -613,14 +617,17 @@ FunctionType *ShaderMerger::generateEsGsEntryPointType(uint64_t *inRegMask) cons
 // @param esEntryPoint : Entry-point of hardware export shader (ES) (could be null)
 // @param gsEntryPoint : Entry-point of hardware geometry shader (GS)
 Function *ShaderMerger::generateEsGsEntryPoint(Function *esEntryPoint, Function *gsEntryPoint) {
+  bool createDbgInfo = false;
   if (esEntryPoint) {
     esEntryPoint->setLinkage(GlobalValue::InternalLinkage);
     esEntryPoint->addFnAttr(Attribute::AlwaysInline);
+    createDbgInfo = esEntryPoint->getSubprogram() != nullptr;
   }
 
   assert(gsEntryPoint);
   gsEntryPoint->setLinkage(GlobalValue::InternalLinkage);
   gsEntryPoint->addFnAttr(Attribute::AlwaysInline);
+  createDbgInfo |= gsEntryPoint->getSubprogram() != nullptr;
 
   processRayQueryLdsStack(esEntryPoint, gsEntryPoint);
 
@@ -634,7 +641,7 @@ Function *ShaderMerger::generateEsGsEntryPoint(Function *esEntryPoint, Function
   // because the vertex fetch shader will be prepended to this module and expect the fall through into the merged
   // shader.
   Function *entryPoint =
-      createFunctionHelper(entryPointTy, GlobalValue::ExternalLinkage, module, lgcName::EsGsEntryPoint);
+      createFunctionHelper(entryPointTy, GlobalValue::ExternalLinkage, module, createDbgInfo, lgcName::EsGsEntryPoint);
   entryPoint->setDLLStorageClass(GlobalValue::DLLExportStorageClass);
   module->getFunctionList().push_front(entryPoint);
 
@@ -802,7 +809,7 @@ Function *ShaderMerger::generateEsGsEntryPoint(Function *esEntryPoint, Function
       appendArguments(esArgs, vertexFetches);
     }
 
-    CallInst *call = builder.CreateCall(esEntryPoint, esArgs);
+    CallInst *call = callFunctionHelper(esEntryPoint, esArgs, builder.GetInsertBlock());
     call->setCallingConv(CallingConv::AMDGPU_ES);
   }
   builder.CreateBr(endEsBlock);
@@ -857,7 +864,7 @@ Function *ShaderMerger::generateEsGsEntryPoint(Function *esEntryPoint, Function
     gsArgs.push_back(esGsOffset5);
     gsArgs.push_back(invocationId);
 
-    CallInst *call = builder.CreateCall(gsEntryPoint, gsArgs);
+    CallInst *call = callFunctionHelper(gsEntryPoint, gsArgs, builder.GetInsertBlock());
     call->setCallingConv(CallingConv::AMDGPU_GS);
   }
   builder.CreateBr(endGsBlock);
diff --git a/lgc/patch/StructurizeBuffers.cpp b/lgc/patch/StructurizeBuffers.cpp
new file mode 100644
index 0000000000..9519a73125
--- /dev/null
+++ b/lgc/patch/StructurizeBuffers.cpp
@@ -0,0 +1,194 @@
+/*
+ ***********************************************************************************************************************
+ *
+ *  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal in the Software without restriction, including without limitation the
+ *  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ *  sell copies of the Software, and to permit persons to whom the Software is
+ *  furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice shall be included in all
+ *  copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ *
+ **********************************************************************************************************************/
+/**
+ ***********************************************************************************************************************
+ * @file  StructurizeBuffers.cpp
+ * @brief LLPC source file: contains implementation of class lgc::StructurizeBuffers.
+ ***********************************************************************************************************************
+ */
+
+#include "lgc/patch/StructurizeBuffers.h"
+#include "compilerutils/CompilerUtils.h"
+#include "lgc/CommonDefs.h"
+#include "lgc/LgcDialect.h"
+#include "lgc/state/PipelineState.h"
+#include "llvm-dialects/Dialect/Builder.h"
+#include "llvm-dialects/Dialect/Visitor.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Casting.h"
+
+#define DEBUG_TYPE "lgc-structurize-buffers"
+
+using namespace llvm;
+using namespace lgc;
+
+namespace {
+
+struct StructurizeBuffersImpl {
+  StructurizeBuffersImpl(Function *function, bool robustBufferAccess);
+
+  bool run();
+  void visitBufferIndex(BufferIndexOp &bufferIndex);
+
+  Function *m_function;                                            // Function the pass runs on
+  llvm_dialects::Builder m_builder;                                // Builder used to emit the replacement IR
+  MapVector<Value *, SmallVector<BufferIndexOp *>> bufferIndexOps; // BufferIndexOps grouped by base pointer
+  bool robustBufferAccess;                                         // If set, all ops are lowered to byte GEPs
+};
+
+} // anonymous namespace
+
+// =====================================================================================================================
+// Runs the buffer structurization on the specified LLVM function.
+//
+// @param [in/out] function : LLVM function to be run on
+// @param [in/out] analysisManager : Analysis manager; used to look up the module's cached PipelineStateWrapper
+// @returns : PreservedAnalyses::none() if the function was modified, PreservedAnalyses::all() otherwise
+PreservedAnalyses StructurizeBuffers::run(Function &function, FunctionAnalysisManager &analysisManager) {
+  const auto &moduleAnalysisManager = analysisManager.getResult<ModuleAnalysisManagerFunctionProxy>(function);
+  PipelineState *pipelineState =
+      moduleAnalysisManager.getCachedResult<PipelineStateWrapper>(*function.getParent())->getPipelineState();
+  bool robustBufferAccess =
+      pipelineState->getOptions().enableExtendedRobustBufferAccess || pipelineState->getOptions().robustBufferAccess;
+  StructurizeBuffersImpl impl(&function, robustBufferAccess);
+
+  if (impl.run())
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+StructurizeBuffersImpl::StructurizeBuffersImpl(Function *function, bool robustBufferAccess) // Stores pass inputs
+    : m_function(function), m_builder(function->getContext()), robustBufferAccess(robustBufferAccess) {
+}
+
+void StructurizeBuffersImpl::visitBufferIndex(BufferIndexOp &bufferIndex) {
+  bufferIndexOps[bufferIndex.getPtr()].push_back(&bufferIndex); // Group the op under its base pointer
+}
+
+// =====================================================================================================================
+// Lowers every BufferIndexOp of m_function, either to a strided buffer pointer or to a plain byte-offset GEP.
+// All BufferIndexOps that share a base pointer are lowered the same way.
+//
+// @returns : True if the function was modified by the transformation and false otherwise
+bool StructurizeBuffersImpl::run() {
+  // Collect the BufferIndexOps of the function, grouped by base pointer.
+  static const auto visitor =
+      llvm_dialects::VisitorBuilder<StructurizeBuffersImpl>().add(&StructurizeBuffersImpl::visitBufferIndex).build();
+
+  visitor.visit(*this, *m_function);
+
+  if (bufferIndexOps.empty())
+    return false;
+
+  auto isConvertible = [](BufferIndexOp *mark) -> bool { // Non-constant index and stride > 4
+    if (isa<ConstantInt>(mark->getIndex()))
+      return false;
+    return mark->getStride() > 4;
+  };
+
+  auto storesBuffer = [&m_builder = m_builder](User *user) -> bool { // Stores a fat buffer pointer as a value
+    if (auto store = dyn_cast<StoreInst>(user))
+      return store->getValueOperand()->getType() == m_builder.getPtrTy(ADDR_SPACE_BUFFER_FAT_POINTER);
+
+    return false;
+  };
+
+  auto isSupported = [](User *user) -> bool { // The only non-GEP users that are accepted
+#if !defined(LLVM_MAIN_REVISION) || LLVM_MAIN_REVISION >= 506212
+    return isa<LoadInst, StoreInst, SelectInst, AtomicRMWInst, AtomicCmpXchgInst>(user);
+#else
+    return isa<LoadInst, StoreInst, SelectInst>(user);
+#endif
+  };
+
+  SmallVector<Value *> notConvertible; // Base pointers whose ops fall back to the byte-offset GEP
+  for (auto &base : bufferIndexOps) {
+    if (robustBufferAccess) {
+      notConvertible.push_back(base.first);
+      continue;
+    }
+
+    if (base.first->getType()->getPointerAddressSpace() != ADDR_SPACE_BUFFER_FAT_POINTER) {
+      notConvertible.push_back(base.first);
+      continue;
+    }
+
+    if (llvm::none_of(base.second, isConvertible)) {
+      notConvertible.push_back(base.first);
+      continue;
+    }
+
+    bool convertible = true; // Shared by all ops of this base: the first unsupported use ends the scan
+    for (auto *bufferIndexOp : base.second) {
+      SmallVector<Value *> worklist;
+      worklist.push_back(bufferIndexOp);
+      while (!worklist.empty() && convertible) {
+        auto *current = worklist.pop_back_val();
+
+        for (auto *user : current->users()) {
+          if (isa<GetElementPtrInst>(user)) {
+            worklist.push_back(user);
+            continue;
+          }
+
+          if (!isSupported(user) || storesBuffer(user)) {
+            notConvertible.push_back(base.first);
+            convertible = false;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  SmallVector<Instruction *> toRemove;
+  for (const auto &pointer : bufferIndexOps) {
+    if (llvm::is_contained(notConvertible, pointer.first)) { // Fallback: byte offset = index * stride
+      for (auto *bufferIndexOp : pointer.second) {
+        m_builder.SetInsertPoint(bufferIndexOp);
+        auto *offset = m_builder.CreateMul(bufferIndexOp->getIndex(), m_builder.getInt32(bufferIndexOp->getStride()));
+        auto *gep = m_builder.CreateGEP(m_builder.getInt8Ty(), bufferIndexOp->getPtr(), offset);
+        bufferIndexOp->replaceAllUsesWith(gep);
+        toRemove.push_back(bufferIndexOp);
+      }
+    } else {
+      for (auto *bufferIndexOp : pointer.second) {
+        m_builder.SetInsertPoint(bufferIndexOp);
+        Value *strided =
+            m_builder.create<ConvertToStridedBufferPointerOp>(bufferIndexOp->getPtr(), bufferIndexOp->getStride());
+        strided = m_builder.create<StridedIndexAddOp>(strided, bufferIndexOp->getIndex());
+
+        toRemove.push_back(bufferIndexOp);
+        CompilerUtils::replaceAllPointerUses(&m_builder, bufferIndexOp, strided, toRemove);
+      }
+    }
+  }
+
+  for (Instruction *I : reverse(toRemove)) // Erase in reverse order of collection
+    I->eraseFromParent();
+
+  return true;
+}
diff --git a/lgc/state/PalMetadata.cpp b/lgc/state/PalMetadata.cpp
index 91f900830c..c4b7f8c8a9 100644
--- a/lgc/state/PalMetadata.cpp
+++ b/lgc/state/PalMetadata.cpp
@@ -329,6 +329,9 @@ void PalMetadata::fixUpRegisters() {
         case PrimitiveType::TriangleStripAdjacency:
           gsOutputPrimitiveType = 2; // TRISTRIP
           break;
+        case PrimitiveType::Rect:
+          gsOutputPrimitiveType = 4; // RECTLIST
+          break;
         default:
           llvm_unreachable("Should never be called!");
           break;
diff --git a/lgc/state/PassManagerCache.cpp b/lgc/state/PassManagerCache.cpp
index 25a6325651..87e3ec5447 100644
--- a/lgc/state/PassManagerCache.cpp
+++ b/lgc/state/PassManagerCache.cpp
@@ -31,7 +31,7 @@
 #include "lgc/state/PassManagerCache.h"
 #include "lgc/LgcContext.h"
 #include "lgc/patch/IncludeLlvmIr.h"
-#include "lgc/patch/PatchSetupTargetFeatures.h"
+#include "lgc/patch/SetupTargetFeatures.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 442438
 // Old version of the code
@@ -108,7 +108,7 @@ std::pair<lgc::PassManager &, LegacyPassManager &> PassManagerCache::getPassMana
   fpm.addPass(EarlyCSEPass(true));
   passManagers.first->addPass(createModuleToFunctionPassAdaptor(std::move(fpm)));
   passManagers.first->addPass(PatchSetupTargetFeatures());
-  passManagers.first->addPass(PatchLlvmIrInclusion());
+  passManagers.first->addPass(IncludeLlvmIr());
 
   // Add one last pass that does nothing, but invalidates all the analyses.
   // This is required to avoid the pass manager to use results of analyses from
diff --git a/lgc/state/PipelineState.cpp b/lgc/state/PipelineState.cpp
index 947559e43e..68a8bf98e8 100644
--- a/lgc/state/PipelineState.cpp
+++ b/lgc/state/PipelineState.cpp
@@ -32,7 +32,7 @@
 #include "lgc/CommonDefs.h"
 #include "lgc/LgcContext.h"
 #include "lgc/PassManager.h"
-#include "lgc/patch/FragColorExport.h"
+#include "lgc/patch/FragmentColorExport.h"
 #include "lgc/state/AbiMetadata.h"
 #include "lgc/state/PalMetadata.h"
 #include "lgc/state/TargetInfo.h"
@@ -1311,19 +1311,18 @@ unsigned PipelineState::getNumPatchControlPoints() const {
 // =====================================================================================================================
 // Gets wave size for the specified shader stage
 //
-// NOTE: Need to be called after PatchResourceCollect pass, so usage of subgroupSize is confirmed.
-//
 // @param stage : Shader stage
 unsigned PipelineState::getShaderWaveSize(ShaderStageEnum stage) {
+  if (m_waveSize.empty()) {
+    setAllShadersDefaultWaveSize();
+  }
+
   if (stage == ShaderStage::CopyShader) {
     // Treat copy shader as part of geometry shader
     stage = ShaderStage::Geometry;
   }
 
   assert(ShaderStageMask(ShaderStagesNative).contains(stage));
-  if (!m_waveSize[stage])
-    setShaderDefaultWaveSize(stage);
-
   return getMergedShaderWaveSize(stage);
 }
 
@@ -1491,37 +1490,45 @@ unsigned PipelineState::getShaderHwStageMask(ShaderStageEnum stage) {
 // @param stage : Shader stage
 // @returns : Subgroup size of the specified shader stage
 unsigned PipelineState::getShaderSubgroupSize(ShaderStageEnum stage) {
+  if (m_subgroupSize.empty()) {
+    setAllShadersDefaultWaveSize();
+  }
+
   if (stage == ShaderStage::CopyShader) {
     // Treat copy shader as part of geometry shader
     stage = ShaderStage::Geometry;
   }
 
   assert(stage <= ShaderStage::Compute);
-  if (!m_subgroupSize[stage])
-    setShaderDefaultWaveSize(stage);
-
+  assert(m_subgroupSize[stage]);
   return m_subgroupSize[stage];
 }
 
+// =====================================================================================================================
+// Calls setShaderDefaultWaveSize for every stage in ShaderStagesNative.
+void PipelineState::setAllShadersDefaultWaveSize() {
+  for (auto stage : ShaderStagesNative)
+    setShaderDefaultWaveSize(stage);
+}
+
 // =====================================================================================================================
 // Set the default wave size for the specified shader stage
 //
 // @param stage : Shader stage
 void PipelineState::setShaderDefaultWaveSize(ShaderStageEnum stage) {
-  ShaderStageEnum checkingStage = stage;
   if (stage == ShaderStage::Geometry && !hasShaderStage(ShaderStage::Geometry)) {
     // NOTE: For NGG, GS could be absent and VS/TES acts as part of it in the merged shader.
-    // In such cases, we check the property of VS or TES.
-    checkingStage = hasShaderStage(ShaderStage::TessEval) ? ShaderStage::TessEval : ShaderStage::Vertex;
+    // In such cases, we check the property of VS or TES, and this will be handled in getMergedShaderWaveSize.
+    return;
   }
 
-  if (checkingStage == ShaderStage::Compute) {
+  if (stage == ShaderStage::Compute) {
     const unsigned subgroupSize = m_shaderModes.getComputeShaderMode().subgroupSize;
-    m_waveSize[checkingStage] = subgroupSize;
-    m_subgroupSize[checkingStage] = subgroupSize;
+    m_waveSize[stage] = subgroupSize;
+    m_subgroupSize[stage] = subgroupSize;
   }
 
-  if (!m_waveSize[checkingStage]) {
+  if (!m_waveSize[stage]) {
     unsigned waveSize = getTargetInfo().getGpuProperty().waveSize;
     unsigned subgroupSize = waveSize;
 
@@ -1532,7 +1539,7 @@ void PipelineState::setShaderDefaultWaveSize(ShaderStageEnum stage) {
     //  4) If gl_SubgroupSize is not used in the (mesh/task/compute) shader, and the workgroup size is
     //     not larger than 32, use wave size 32.
 
-    if (checkingStage == ShaderStage::Fragment) {
+    if (stage == ShaderStage::Fragment) {
       // Per programming guide, it's recommended to use wave64 for fragment shader.
       waveSize = 64;
     } else if (hasShaderStage(ShaderStage::Geometry)) {
@@ -1551,11 +1558,11 @@ void PipelineState::setShaderDefaultWaveSize(ShaderStageEnum stage) {
     if (getTargetInfo().getGfxIpVersion() >= GfxIpVersion({11}))
       waveSize = 64;
 
-    unsigned waveSizeOption = getShaderOptions(checkingStage).waveSize;
+    unsigned waveSizeOption = getShaderOptions(stage).waveSize;
     if (waveSizeOption != 0) {
       waveSize = waveSizeOption;
 
-      if (checkingStage == ShaderStage::Geometry && getTargetInfo().getGfxIpVersion().major == 10) {
+      if (stage == ShaderStage::Geometry && getTargetInfo().getGfxIpVersion().major == 10) {
         // Legacy (non-GS) HW path for GS does not support wave32 mode. Ignore the settings.
         waveSize = 64;
       }
@@ -1563,14 +1570,13 @@ void PipelineState::setShaderDefaultWaveSize(ShaderStageEnum stage) {
 
     // Note: the conditions below override the tuning option.
     // If workgroup size is not larger than 32, use wave size 32.
-    if (checkingStage == ShaderStage::Mesh || checkingStage == ShaderStage::Task ||
-        checkingStage == ShaderStage::Compute) {
+    if (stage == ShaderStage::Mesh || stage == ShaderStage::Task || stage == ShaderStage::Compute) {
       unsigned workGroupSize;
-      if (checkingStage == ShaderStage::Mesh) {
+      if (stage == ShaderStage::Mesh) {
         auto &mode = m_shaderModes.getMeshShaderMode();
         workGroupSize = mode.workgroupSizeX * mode.workgroupSizeY * mode.workgroupSizeZ;
       } else {
-        assert(checkingStage == ShaderStage::Task || checkingStage == ShaderStage::Compute);
+        assert(stage == ShaderStage::Task || stage == ShaderStage::Compute);
         auto &mode = m_shaderModes.getComputeShaderMode();
         workGroupSize = mode.workgroupSizeX * mode.workgroupSizeY * mode.workgroupSizeZ;
       }
@@ -1582,11 +1588,11 @@ void PipelineState::setShaderDefaultWaveSize(ShaderStageEnum stage) {
     // If subgroup size is used in any shader in the pipeline, use the specified subgroup size.
     if (m_shaderModes.getAnyUseSubgroupSize()) {
       // If allowVaryWaveSize is enabled, subgroupSize is default as zero, initialized as waveSize
-      subgroupSize = getShaderOptions(checkingStage).subgroupSize;
+      subgroupSize = getShaderOptions(stage).subgroupSize;
       // The driver only sets waveSize if a size is requested by an app. We may want to change that in the driver to
       // set subgroupSize instead.
       if (subgroupSize == 0)
-        subgroupSize = getShaderOptions(checkingStage).waveSize;
+        subgroupSize = getShaderOptions(stage).waveSize;
       if (subgroupSize == 0)
         subgroupSize = waveSize;
 
@@ -1600,12 +1606,8 @@ void PipelineState::setShaderDefaultWaveSize(ShaderStageEnum stage) {
     assert(waveSize == 32 || waveSize == 64);
     assert(waveSize <= subgroupSize);
 
-    m_waveSize[checkingStage] = waveSize;
-    m_subgroupSize[checkingStage] = subgroupSize;
-  }
-  if (stage != checkingStage) {
-    m_waveSize[stage] = m_waveSize[checkingStage];
-    m_subgroupSize[stage] = m_subgroupSize[checkingStage];
+    m_waveSize[stage] = waveSize;
+    m_subgroupSize[stage] = subgroupSize;
   }
 }
 
@@ -1613,7 +1615,7 @@ void PipelineState::setShaderDefaultWaveSize(ShaderStageEnum stage) {
 // Whether WGP mode is enabled for the given shader stage
 //
 // @param stage : Shader stage
-bool PipelineState::getShaderWgpMode(ShaderStageEnum stage) const {
+bool PipelineState::getShaderWgpMode(ShaderStageEnum stage) {
   if (stage == ShaderStage::CopyShader) {
     // Treat copy shader as part of geometry shader
     stage = ShaderStage::Geometry;
@@ -1621,7 +1623,20 @@ bool PipelineState::getShaderWgpMode(ShaderStageEnum stage) const {
 
   assert(ShaderStageMask(ShaderStagesNative).contains(stage));
 
-  return m_shaderOptions.lookup(stage).wgpMode;
+  bool wgpMode = m_shaderOptions.lookup(stage).wgpMode;
+  if (!wgpMode) {
+    if (getTargetInfo().getGpuProperty().numComputeUnitsPerShaderEngine > 2) {
+      // Waves will be distributed across both CUs in a WGP with WGP_MODE=1. This is problematic if any
+      // CUs are reserved on devices with only a single WGP (2 CUs).
+      if (m_nggControl.enableNgg && m_nggControl.passthroughMode) {
+        // Performance tests show that NGG passthrough performs best in WGP mode on HW GS.
+        if (!hasShaderStage(ShaderStage::Geometry) && (stage == ShaderStage::Vertex || stage == ShaderStage::TessEval))
+          wgpMode = true;
+      }
+    }
+  }
+
+  return wgpMode;
 }
 
 // =====================================================================================================================
@@ -2011,6 +2026,7 @@ unsigned PipelineState::getVerticesPerPrimitive() {
     case lgc::PrimitiveType::TriangleFan:
     case lgc::PrimitiveType::TriangleListAdjacency:
     case lgc::PrimitiveType::TriangleStripAdjacency:
+    case lgc::PrimitiveType::Rect:
       return 3;
     case lgc::PrimitiveType::Patch:
       return 1;
diff --git a/lgc/state/TargetInfo.cpp b/lgc/state/TargetInfo.cpp
index 0b0aae94ce..fb59c8abcf 100644
--- a/lgc/state/TargetInfo.cpp
+++ b/lgc/state/TargetInfo.cpp
@@ -78,6 +78,7 @@ static void setGfx10BaseInfo(TargetInfo *targetInfo) {
   targetInfo->getGpuProperty().waveSize = 64;
 
   targetInfo->getGpuProperty().numShaderEngines = 4;
+  targetInfo->getGpuProperty().numComputeUnitsPerShaderEngine = 10;
   targetInfo->getGpuProperty().maxSgprsAvailable = 104;
   targetInfo->getGpuProperty().maxVgprsAvailable = 256;
 
@@ -91,20 +92,14 @@ static void setGfx10BaseInfo(TargetInfo *targetInfo) {
   // TODO: Accept gsOnChipDefaultPrimsPerSubgroup from panel option
   targetInfo->getGpuProperty().gsOnChipDefaultPrimsPerSubgroup = 64;
 
-  targetInfo->getGpuProperty().tessFactorBufferSizePerSe = 4096;
-
-  // TODO: Accept gsOnChipDefaultLdsSizePerSubgroup from panel option
-  targetInfo->getGpuProperty().gsOnChipDefaultLdsSizePerSubgroup = 8192;
-
   targetInfo->getGpuProperty().ldsSizePerThreadGroup = 16384;
 
   targetInfo->getGpuProperty().maxSgprsAvailable = 102;
   targetInfo->getGpuProperty().supportsDpp = true;
 
   targetInfo->getGpuProperty().maxUserDataCount = 32;
-  targetInfo->getGpuProperty().gsOnChipDefaultLdsSizePerSubgroup = 0; // GFX9+ does not use this
+  targetInfo->getGpuProperty().gsOnChipDefaultLdsSizePerSubgroup = 8192;
   targetInfo->getGpuProperty().tessFactorBufferSizePerSe = 8192;
-  targetInfo->getGpuProperty().numShaderEngines = 4;
   targetInfo->getGpuProperty().maxMsaaRasterizerSamples = 16;
 }
 
@@ -154,7 +149,6 @@ static void setGfx1010Info(TargetInfo *targetInfo) {
   targetInfo->getGpuWorkarounds().gfx10.waFixBadImageDescriptor = 1;
 }
 
-#if LLPC_BUILD_NAVI12
 // gfx1011
 //
 // @param [in/out] targetInfo : Target info
@@ -180,7 +174,6 @@ static void setGfx1011Info(TargetInfo *targetInfo) {
   targetInfo->getGpuProperty().supportIntegerDotFlag.compBitwidth4 = true;
   targetInfo->getGpuProperty().supportIntegerDotFlag.sameSignedness = true;
 }
-#endif
 
 // gfx1012
 //
@@ -188,6 +181,7 @@ static void setGfx1011Info(TargetInfo *targetInfo) {
 static void setGfx1012Info(TargetInfo *targetInfo) {
   setGfx10Info(targetInfo);
 
+  targetInfo->getGpuProperty().numComputeUnitsPerShaderEngine = 14;
   targetInfo->getGpuProperty().supportsXnack = 1;
 
   targetInfo->getGpuWorkarounds().gfx10.waShaderInstPrefetch0 = 1;
@@ -251,6 +245,7 @@ static void setGfx1032Info(TargetInfo *targetInfo) {
   setGfx103Info(targetInfo);
 
   targetInfo->getGpuProperty().numShaderEngines = 2;
+  targetInfo->getGpuProperty().numComputeUnitsPerShaderEngine = 8;
   targetInfo->getGpuWorkarounds().gfx10.waClearWriteCompressBit = 1;
 }
 
@@ -262,9 +257,9 @@ static void setGfx1034Info(TargetInfo *targetInfo) {
   setGfx103Info(targetInfo);
 
   targetInfo->getGpuProperty().numShaderEngines = 1;
+  targetInfo->getGpuProperty().numComputeUnitsPerShaderEngine = 8;
 }
 
-#if LLPC_BUILD_REMBRANDT
 // gfx1035
 //
 // @param [in/out] targetInfo : Target info
@@ -273,11 +268,10 @@ static void setGfx1035Info(TargetInfo *targetInfo) {
   setGfx103Info(targetInfo);
 
   targetInfo->getGpuProperty().numShaderEngines = 1;
+  targetInfo->getGpuProperty().numComputeUnitsPerShaderEngine = 6;
   targetInfo->getGpuWorkarounds().gfx10.waClearWriteCompressBit = 1;
 }
-#endif
 
-#if LLPC_BUILD_RAPHAEL || LLPC_BUILD_MENDOCINO
 // gfx1036
 //
 // @param [in/out] targetInfo : Target info
@@ -286,8 +280,8 @@ static void setGfx1036Info(TargetInfo *targetInfo) {
   setGfx103Info(targetInfo);
 
   targetInfo->getGpuProperty().numShaderEngines = 1;
+  targetInfo->getGpuProperty().numComputeUnitsPerShaderEngine = 2;
 }
-#endif
 
 // gfx11
 //
@@ -313,9 +307,9 @@ static void setGfx1100Info(TargetInfo *targetInfo) {
   targetInfo->getGpuWorkarounds().gfx11.waAtmPrecedesPos = 1;
 
   targetInfo->getGpuProperty().numShaderEngines = 6;
+  targetInfo->getGpuProperty().numComputeUnitsPerShaderEngine = 8;
 }
 
-#if LLPC_BUILD_NAVI32
 // gfx1101
 //
 // @param [in/out] targetInfo : Target info
@@ -325,8 +319,8 @@ static void setGfx1101Info(TargetInfo *targetInfo) {
   targetInfo->getGpuWorkarounds().gfx11.waAtmPrecedesPos = 1;
 
   targetInfo->getGpuProperty().numShaderEngines = 3;
+  targetInfo->getGpuProperty().numComputeUnitsPerShaderEngine = 10;
 }
-#endif
 
 // gfx1102
 //
@@ -338,9 +332,9 @@ static void setGfx1102Info(TargetInfo *targetInfo) {
   targetInfo->getGpuWorkarounds().gfx11.waAtmPrecedesPos = 1;
 
   targetInfo->getGpuProperty().numShaderEngines = 2;
+  targetInfo->getGpuProperty().numComputeUnitsPerShaderEngine = 8;
 }
 
-#if LLPC_BUILD_PHOENIX1 || LLPC_BUILD_PHOENIX2
 // gfx1103
 //
 // @param [in/out] targetInfo : Target info
@@ -350,8 +344,8 @@ static void setGfx1103Info(TargetInfo *targetInfo) {
   targetInfo->getGpuWorkarounds().gfx11.waAtmPrecedesPos = 1;
 
   targetInfo->getGpuProperty().numShaderEngines = 1;
+  targetInfo->getGpuProperty().numComputeUnitsPerShaderEngine = 6;
 }
-#endif
 
 #if LLPC_BUILD_STRIX1
 // gfx1150
@@ -363,6 +357,7 @@ static void setGfx1150Info(TargetInfo *targetInfo) {
   targetInfo->getGpuWorkarounds().gfx11.waAtmPrecedesPos = 1;
 
   targetInfo->getGpuProperty().numShaderEngines = 1;
+  targetInfo->getGpuProperty().numComputeUnitsPerShaderEngine = 8;
 }
 
 // gfx115F
@@ -374,6 +369,7 @@ static void setGfx115FInfo(TargetInfo *targetInfo) {
   targetInfo->getGpuWorkarounds().gfx11.waAtmPrecedesPos = 1;
 
   targetInfo->getGpuProperty().numShaderEngines = 1;
+  targetInfo->getGpuProperty().numComputeUnitsPerShaderEngine = 8;
 }
 #endif
 
@@ -386,29 +382,19 @@ struct GpuNameStringMap {
 
 // The supported device list
 static const GpuNameStringMap GpuNameMap[] = {
-    {"gfx1010", "Navi10", &setGfx1010Info}, // gfx1010
-#if LLPC_BUILD_NAVI12
-    {"gfx1011", "Navi12", &setGfx1011Info}, // gfx1011
-#endif
-    {"gfx1012", "Navi14", &setGfx1012Info}, // gfx1012
-    {"gfx1030", "Navi21", &setGfx1030Info}, // gfx1030
-    {"gfx1031", "Navi22", &setGfx1031Info}, // gfx1031
-    {"gfx1032", "Navi23", &setGfx1032Info}, // gfx1032
-    {"gfx1034", "Navi24", &setGfx1034Info}, // gfx1034
-#if LLPC_BUILD_REMBRANDT
+    {"gfx1010", "Navi10", &setGfx1010Info},    // gfx1010
+    {"gfx1011", "Navi12", &setGfx1011Info},    // gfx1011
+    {"gfx1012", "Navi14", &setGfx1012Info},    // gfx1012
+    {"gfx1030", "Navi21", &setGfx1030Info},    // gfx1030
+    {"gfx1031", "Navi22", &setGfx1031Info},    // gfx1031
+    {"gfx1032", "Navi23", &setGfx1032Info},    // gfx1032
+    {"gfx1034", "Navi24", &setGfx1034Info},    // gfx1034
     {"gfx1035", "Rembrandt", &setGfx1035Info}, // gfx1035
-#endif
-#if LLPC_BUILD_RAPHAEL || LLPC_BUILD_MENDOCINO
-    {"gfx1036", "Raphael", &setGfx1036Info}, // gfx1036
-#endif
-    {"gfx1100", "Navi31", &setGfx1100Info}, // gfx1100
-#if LLPC_BUILD_NAVI32
-    {"gfx1101", "Navi32", &setGfx1101Info}, // gfx1101
-#endif
-    {"gfx1102", "Navi33", &setGfx1102Info}, // gfx1102
-#if LLPC_BUILD_PHOENIX1 || LLPC_BUILD_PHOENIX2
-    {"gfx1103", "Phoenix1", &setGfx1103Info}, // gfx1103
-#endif
+    {"gfx1036", "Raphael", &setGfx1036Info},   // gfx1036
+    {"gfx1100", "Navi31", &setGfx1100Info},    // gfx1100
+    {"gfx1101", "Navi32", &setGfx1101Info},    // gfx1101
+    {"gfx1102", "Navi33", &setGfx1102Info},    // gfx1102
+    {"gfx1103", "Phoenix1", &setGfx1103Info},  // gfx1103
 #if LLPC_BUILD_STRIX1
     {"gfx1150", "Strix1", &setGfx1150Info},    // gfx1150
     {"gfx115F", "Strix1 A0", &setGfx115FInfo}, // gfx115F
diff --git a/lgc/test/CleanUndefOutputValues.lgc b/lgc/test/CleanUndefOutputValues.lgc
new file mode 100644
index 0000000000..67135bf7a5
--- /dev/null
+++ b/lgc/test/CleanUndefOutputValues.lgc
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 5
+; Check the case that undef output value cannot be removed since the location may be re-written with valid value later.
+
+; RUN: lgc -mcpu=gfx1010 -passes=lgc-patch-resource-collect -o - %s -o - | FileCheck --check-prefixes=CHECK1 %s
+
+; Function Attrs: alwaysinline nounwind
+define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !23 !lgc.shaderstage !19 {
+; CHECK1-LABEL: define dllexport spir_func void @lgc.shader.VS.main(
+; CHECK1-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META21:![0-9]+]] !lgc.shaderstage [[META18:![0-9]+]] {
+; CHECK1-NEXT:  [[_ENTRY:.*:]]
+; CHECK1-NEXT:    call void @lgc.output.export.generic.i32.i32.f32(i32 2, i32 0, float 2.000000e+00) #[[ATTR1:[0-9]+]]
+; CHECK1-NEXT:    ret void
+;
+.entry:
+  call void @lgc.output.export.generic.i32.i32.f32(i32 1, i32 0, float undef) #1
+  call void @lgc.output.export.generic.i32.i32.f32(i32 2, i32 0, float 2.000000e+00) #1
+  ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define dllexport spir_func void @lgc.shader.TCS.main() local_unnamed_addr #0 !spirv.ExecutionModel !19 !lgc.shaderstage !24 {
+; CHECK1-LABEL: define dllexport spir_func void @lgc.shader.TCS.main(
+; CHECK1-SAME: ) local_unnamed_addr #[[ATTR0]] !spirv.ExecutionModel [[META18]] !lgc.shaderstage [[META22:![0-9]+]] {
+; CHECK1-NEXT:  [[_ENTRY:.*:]]
+; CHECK1-NEXT:    [[POSITION:%.*]] = call <4 x float> @lgc.input.import.builtin.Position.v4f32.i32.i32.i32(i32 0, i32 -1, i32 0) #[[ATTR2:[0-9]+]]
+; CHECK1-NEXT:    [[INVOCATIONID:%.*]] = call i32 @lgc.input.import.builtin.InvocationId.i32.i32.i32.i32(i32 8, i32 -1, i32 -1) #[[ATTR2]]
+; CHECK1-NEXT:    call void @lgc.output.export.generic.i32.i32.i32.i32.v4f32(i32 0, i32 0, i32 0, i32 [[INVOCATIONID]], <4 x float> [[POSITION]]) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.builtin.TessLevelOuter.i32.i32.i32.f32(i32 11, i32 0, i32 -1, float 2.000000e+00) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.builtin.TessLevelOuter.i32.i32.i32.f32(i32 11, i32 1, i32 -1, float undef) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.builtin.TessLevelOuter.i32.i32.i32.f32(i32 11, i32 0, i32 -1, float 2.000000e+00) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.builtin.TessLevelOuter.i32.i32.i32.f32(i32 11, i32 1, i32 -1, float 1.000000e+00) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 1, i32 0, i32 1, i32 -1, float 2.000000e+00) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 2, i32 0, i32 1, i32 -1, float undef) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 1, i32 0, i32 1, i32 -1, float 2.000000e+00) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 2, i32 0, i32 1, i32 -1, float 3.000000e+00) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 1, i32 0, i32 2, i32 -1, float 4.000000e+00) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 2, i32 0, i32 2, i32 -1, float undef) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 1, i32 0, i32 2, i32 -1, float 4.000000e+00) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 2, i32 0, i32 2, i32 -1, float 3.000000e+00) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 1, i32 0, i32 3, i32 -1, float 1.000000e+01) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 1, i32 0, i32 3, i32 -1, float 1.000000e+01) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 2, i32 0, i32 3, i32 -1, float 1.200000e+01) #[[ATTR1]]
+; CHECK1-NEXT:    call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 2, i32 0, i32 3, i32 -1, float undef) #[[ATTR1]]
+; CHECK1-NEXT:    ret void
+;
+.entry:
+  %Position = call <4 x float> @lgc.input.import.builtin.Position.v4f32.i32.i32.i32(i32 0, i32 -1, i32 0) #2
+  %InvocationId = call i32 @lgc.input.import.builtin.InvocationId.i32.i32.i32.i32(i32 8, i32 -1, i32 -1) #2
+  call void @lgc.output.export.generic.i32.i32.i32.i32.v4f32(i32 0, i32 0, i32 0, i32 %InvocationId, <4 x float> %Position) #1
+  call void @lgc.output.export.builtin.TessLevelOuter.i32.i32.i32.f32(i32 11, i32 0, i32 -1, float 2.000000e+00) #1
+  call void @lgc.output.export.builtin.TessLevelOuter.i32.i32.i32.f32(i32 11, i32 1, i32 -1, float undef) #1
+  call void @lgc.output.export.builtin.TessLevelOuter.i32.i32.i32.f32(i32 11, i32 0, i32 -1, float 2.000000e+00) #1
+  call void @lgc.output.export.builtin.TessLevelOuter.i32.i32.i32.f32(i32 11, i32 1, i32 -1, float 1.000000e+00) #1
+  call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 1, i32 0, i32 1, i32 -1, float 2.000000e+00) #1
+  call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 2, i32 0, i32 1, i32 -1, float undef) #1
+  call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 1, i32 0, i32 1, i32 -1, float 2.000000e+00) #1
+  call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 2, i32 0, i32 1, i32 -1, float 3.000000e+00) #1
+  call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 1, i32 0, i32 2, i32 -1, float 4.000000e+00) #1
+  call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 2, i32 0, i32 2, i32 -1, float undef) #1
+  call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 1, i32 0, i32 2, i32 -1, float 4.000000e+00) #1
+  call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 2, i32 0, i32 2, i32 -1, float 3.000000e+00) #1
+  call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 1, i32 0, i32 3, i32 -1, float 1.000000e+01) #1
+  call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 1, i32 0, i32 3, i32 -1, float 1.000000e+01) #1
+  call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 2, i32 0, i32 3, i32 -1, float 1.200000e+01) #1
+  call void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 2, i32 0, i32 3, i32 -1, float undef) #1
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @lgc.output.export.generic.i32.i32.f32(i32 %0, i32 %1, float %2) #1
+
+; Function Attrs: nounwind
+declare void @lgc.output.export.builtin.Position.i32.v4f32(i32 %0, <4 x float> %1) #1
+
+; Function Attrs: nounwind
+declare void @lgc.output.export.builtin.TessLevelOuter.i32.i32.i32.f32(i32 %0, i32 %1, i32 %2, float %3) #1
+
+; Function Attrs: nounwind willreturn memory(read)
+declare <4 x float> @lgc.input.import.builtin.Position.v4f32.i32.i32.i32(i32 %0, i32 %1, i32 %2) #2
+
+; Function Attrs: nounwind willreturn memory(read)
+declare i32 @lgc.input.import.builtin.InvocationId.i32.i32.i32.i32(i32 %0, i32 %1, i32 %2) #2
+
+; Function Attrs: nounwind
+declare void @lgc.output.export.generic.i32.i32.v4f32(i32 %0, i32 %1, <4 x float> %2) #1
+
+; Function Attrs: nounwind
+declare void @lgc.output.export.generic.i32.i32.i32.i32.f32(i32 %0, i32 %1, i32 %2, i32 %3, float %4) #1
+
+; Function Attrs: nounwind
+declare void @lgc.output.export.generic.i32.i32.i32.i32.v4f32(i32 %0, i32 %1, i32 %2, i32 %3, <4 x float> %4) #1
+
+; Function Attrs: nounwind willreturn memory(read)
+declare float @lgc.input.import.builtin.TessLevelOuter.f32.i32.i32.i32(i32 %0, i32 %1, i32 %2) #2
+
+attributes #0 = { alwaysinline nounwind "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind willreturn memory(read) }
+attributes #3 = { nounwind willreturn memory(none) }
+attributes #4 = { "target-features"=",+wavefrontsize64" }
+
+!lgc.client = !{!0}
+!lgc.options = !{!1}
+!lgc.options.VS = !{!2}
+!lgc.options.TCS = !{!3}
+!lgc.user.data.nodes = !{!5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17}
+!lgc.input.assembly.state = !{!18}
+!lgc.rasterizer.state = !{!19}
+!amdgpu.pal.metadata.msgpack = !{!20}
+!llpc.tcs.mode = !{!21}
+
+!0 = !{!"Vulkan"}
+!1 = !{i32 508543144, i32 -1032098044, i32 896427787, i32 -948777858, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216, i32 0, i32 0, i32 0, i32 65536, i32 0, i32 0, i32 0, i32 0, i32 256, i32 256}
+!2 = !{i32 -1243396572, i32 -1672945329, i32 -1993252358, i32 -718371714, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1}
+!3 = !{i32 -310777470, i32 1140270792, i32 -1043142173, i32 -627491623, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1}
+!4 = !{i32 -825390044, i32 1262025594, i32 -412777296, i32 1953628532, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1}
+!5 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 0, i32 1, i32 0}
+!6 = !{!"DescriptorTableVaPtr", i32 7, i32 14, i32 9, i32 1, i32 3}
+!7 = !{!"DescriptorBufferCompact", i32 10, i32 14, i32 0, i32 2, i64 93, i32 17, i32 2}
+!8 = !{!"DescriptorBuffer", i32 6, i32 14, i32 2, i32 8, i64 93, i32 0, i32 4}
+!9 = !{!"DescriptorBuffer", i32 6, i32 14, i32 10, i32 8, i64 93, i32 1, i32 4}
+!10 = !{!"StreamOutTableVaPtr", i32 11, i32 16, i32 2, i32 1, i32 0}
+!11 = !{!"DescriptorTableVaPtr", i32 7, i32 14, i32 6, i32 1, i32 1}
+!12 = !{!"DescriptorSampler", i32 2, i32 14, i32 0, i32 8192, i64 0, i32 0, i32 4}
+!13 = !{!"DescriptorTableVaPtr", i32 7, i32 14, i32 7, i32 1, i32 2}
+!14 = !{!"DescriptorBuffer", i32 6, i32 14, i32 0, i32 16, i64 1, i32 0, i32 4}
+!15 = !{!"DescriptorMutable", i32 17, i32 14, i32 16, i32 8000000, i64 1, i32 1, i32 8}
+!16 = !{!"DescriptorTableVaPtr", i32 7, i32 14, i32 8, i32 1, i32 1}
+!17 = !{!"DescriptorMutable", i32 17, i32 14, i32 0, i32 8000000, i64 2, i32 0, i32 8}
+!18 = !{i32 10}
+!19 = !{i32 1}
+!20 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CFS\9C]\90\16\8B\C8\FD\CF\C9\BE\A9g\07\B7\8B\B2\AD.llpc_version\A475.4\AEamdpal.version\92\03\00"}
+!21 = !{i32 1, i32 0, i32 3, i32 0, i32 1, i32 1}
+!22 = !{i32 0, i32 0, i32 3}
+!23 = !{i32 0}
+!24 = !{i32 2}
+
+;.
+; CHECK1: [[META18]] = !{i32 1}
+; CHECK1: [[META21]] = !{i32 0}
+; CHECK1: [[META22]] = !{i32 2}
+;.
diff --git a/lgc/test/ImageSampleNoReturn.lgc b/lgc/test/ImageSampleNoReturn.lgc
index 06cfae3042..0cbd540fc4 100644
--- a/lgc/test/ImageSampleNoReturn.lgc
+++ b/lgc/test/ImageSampleNoReturn.lgc
@@ -40,6 +40,6 @@ attributes #1 = { nounwind readnone }
 ; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(4)
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP4]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP4]], i32 -1) ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP4]], align 4, !invariant.load !3
-; CHECK-NEXT:    call void @llvm.amdgcn.image.sample.2d.nortn.f16(i32 15, half 0xH0000, half 0xH0000, <8 x i32> [[TMP5]], <4 x i32> <i32 12288, i32 117436416, i32 1750073344, i32 -2147483648>, i1 false, i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.image.sample.2d.nortn.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH0000, half 0xH0000, <8 x i32> [[TMP5]], <4 x i32> <i32 12288, i32 117436416, i32 1750073344, i32 -2147483648>, i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
diff --git a/lgc/test/PatchInvalidImageDescriptor.lgc b/lgc/test/PatchInvalidImageDescriptor.lgc
index debed60f1e..e79b7bd696 100644
--- a/lgc/test/PatchInvalidImageDescriptor.lgc
+++ b/lgc/test/PatchInvalidImageDescriptor.lgc
@@ -8,17 +8,17 @@
 ; GFX1010-NEXT: and i32
 ; GFX1010-NEXT: select i1
 ; GFX1010-NEXT: [[PATCHED_DESC0:%[.a-zA-Z0-9]+]] = insertelement <8 x i32> %{{[0-9]+}}
-; GFX1010:  %.load = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 1, <8 x i32> [[PATCHED_DESC0]], i32 0, i32 0)
+; GFX1010:  %.load = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32{{(\.v8i32)?}}(i32 15, i32 1, <8 x i32> [[PATCHED_DESC0]], i32 0, i32 0)
 
-; GFX1010: call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> zeroinitializer, i32 15, i32 0, i32 0, <8 x i32> %{{[0-9]+}}, i32 0, i32 0)
+; GFX1010: call void @llvm.amdgcn.image.store.2d.v4f32.i32{{(\.v8i32)?}}(<4 x float> zeroinitializer, i32 15, i32 0, i32 0, <8 x i32> %{{[0-9]+}}, i32 0, i32 0)
 
-; GFX1010: %.sample = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, i1 false, i32 0, i32 0)
+; GFX1010: %.sample = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, i1 false, i32 0, i32 0)
 
-; GFX1010: %.gather = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, <8 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, i1 false, i32 0, i32 0)
+; GFX1010: %.gather = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, <8 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, i1 false, i32 0, i32 0)
 
-; GFX1010: %.atomic = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 1, i32 0, <8 x i32> %{{[0-9]+}}, i32 0, i32 0)
+; GFX1010: %.atomic = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32{{(\.v8i32)?}}(i32 1, i32 0, <8 x i32> %{{[0-9]+}}, i32 0, i32 0)
 
-; GFX1010: %.lod = call <2 x float> @llvm.amdgcn.image.getlod.2d.v2f32.f32(i32 3, float 0.000000e+00, float 0.000000e+00, <8 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, i1 false, i32 0, i32 0)
+; GFX1010: %.lod = call <2 x float> @llvm.amdgcn.image.getlod.2d.v2f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 3, float 0.000000e+00, float 0.000000e+00, <8 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, i1 false, i32 0, i32 0)
 
 ; CHECK: [[WFDESC:%[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane
 ; GFX1010: extractelement <8 x i32> [[WFDESC]], i64 3
@@ -27,7 +27,7 @@
 ; GFX1010-NEXT: select i1
 ; GFX1010-NEXT: [[PATCHED_DESC1:%[.a-zA-Z0-9]+]] = insertelement <8 x i32> [[WFDESC]]
 ; GFX1010: [[WFDESC1:%[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 %{{[0-9]+}}, <8 x i32> [[PATCHED_DESC1]])
-; GFX1010: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> zeroinitializer, i32 15, i32 0, <8 x i32> [[WFDESC1]], i32 0, i32 0)
+; GFX1010: call void @llvm.amdgcn.image.store.1d.v4f32.i32{{(\.v8i32)?}}(<4 x float> zeroinitializer, i32 15, i32 0, <8 x i32> [[WFDESC1]], i32 0, i32 0)
 ; ModuleID = 'lgcPipeline'
 source_filename = "lgcPipeline"
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
diff --git a/lgc/test/TestWaterfallLoopForStruct.lgc b/lgc/test/TestWaterfallLoopForStruct.lgc
index 915a969554..d4e8f34c44 100644
--- a/lgc/test/TestWaterfallLoopForStruct.lgc
+++ b/lgc/test/TestWaterfallLoopForStruct.lgc
@@ -93,7 +93,7 @@ attributes #2 = { nounwind willreturn memory(read) }
 ; CHECK-NEXT:    [[TMP14:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP13]], align 4, !invariant.load !12
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP11]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP15]], <8 x i32> [[TMP14]])
-; CHECK-NEXT:    [[TMP17:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 1, i32 1, <8 x i32> [[TMP16]], i32 1, i32 0)
+; CHECK-NEXT:    [[TMP17:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32{{(\.v8i32)?}}(i32 15, i32 1, i32 1, <8 x i32> [[TMP16]], i32 1, i32 0)
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, i32 } [[TMP17]], 0
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP15]], <4 x float> [[TMP18]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractvalue { <4 x float>, i32 } [[TMP17]], 1
diff --git a/lgc/test/Transforms/Continufy/simple.lgc b/lgc/test/Transforms/Continufy/simple.lgc
index d03d75ebd0..31e0f40330 100644
--- a/lgc/test/Transforms/Continufy/simple.lgc
+++ b/lgc/test/Transforms/Continufy/simple.lgc
@@ -12,7 +12,7 @@ define spir_func void @raygen() !lgc.shaderstage !{i32 7} !continufy.stage !{i32
 ; CHECK-NEXT:    [[DST:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[P16]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[FN]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = or i32 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = call [2 x i32] (...) @lgc.cps.await__a2i32(i32 [[TMP2]], i32 8, i32 poison, i32 [[X]], ptr addrspace(1) [[DST]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call [2 x i32] (...) @lgc.cps.await__a2i32(i32 [[TMP2]], i32 8, i32 poison, i32 [[X]], ptr addrspace(1) [[DST]]), !continuation.returnedRegistercount [[META3]]
 ; CHECK-NEXT:    store [2 x i32] [[TMP3]], ptr addrspace(1) [[DST]], align 4
 ; CHECK-NEXT:    call void @lgc.cps.complete()
 ; CHECK-NEXT:    unreachable
@@ -35,8 +35,8 @@ define spir_func i32 @chs(i32 %x) !lgc.shaderstage !{i32 7} !continufy.stage !{i
 ; CHECK-NEXT:    [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[FN]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = or i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 (...) @lgc.cps.await__i32(i32 [[TMP2]], i32 4, i32 poison, i32 [[X]])
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR]], i32 8, {} poison, i32 poison, i32 poison, i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 (...) @lgc.cps.await__i32(i32 [[TMP2]], i32 4, i32 poison, i32 [[X]]), !continuation.returnedRegistercount [[META3]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, i32 [[TMP3]])
 ; CHECK-NEXT:    unreachable
 ;
   %pushconst = call ptr addrspace(4) @lgc.user.data(i32 24)
@@ -58,7 +58,7 @@ define dllexport void @lgc.shader.CS.main() !lgc.shaderstage !{i32 7} {
 ; CHECK-NEXT:    [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[FN]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = or i32 [[TMP0]], 1
-; CHECK-NEXT:    call void (...) @lgc.cps.await__isVoid(i32 [[TMP1]], i32 2, i32 poison)
+; CHECK-NEXT:    call void (...) @lgc.cps.await__isVoid(i32 [[TMP1]], i32 2, i32 poison), !continuation.returnedRegistercount [[META3]]
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    call void @lgc.cps.complete()
diff --git a/lgc/test/Transforms/CpsLowering/continuation-basic.lgc b/lgc/test/Transforms/CpsLowering/continuation-basic.lgc
index bc1b2c82e5..d5d9442fe1 100644
--- a/lgc/test/Transforms/CpsLowering/continuation-basic.lgc
+++ b/lgc/test/Transforms/CpsLowering/continuation-basic.lgc
@@ -16,7 +16,7 @@ define void @test({i32} %state, i32 %arg, ptr %table) !lgc.cps !0 !lgc.shadersta
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.amdgcn.s.getpc()
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64 [[TMP6]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[VSP]], i32 -4
-; CHECK-NEXT:    [[CPS_STATE:%.*]] = load { i32 }, ptr addrspace(5) [[TMP8]], align 4
+; CHECK-NEXT:    [[CPS_STATE:%.*]] = load { i32 }, ptr addrspace(5) [[TMP8]], align 4, !amdgpu.last.use [[META4:![0-9]+]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr addrspace(5) [[TMP8]] to i32
 ; CHECK-NEXT:    store i32 [[TMP9]], ptr addrspace(5) [[TMP0]], align 4
 ; CHECK-NEXT:    [[V:%.*]] = extractvalue { i32 } [[CPS_STATE]], 0
@@ -53,7 +53,7 @@ define void @test({i32} %state, i32 %arg, ptr %table) !lgc.cps !0 !lgc.shadersta
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
 ; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP31]], i32 [[TMP29]]
 ; CHECK-NEXT:    [[TMP34:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP33]], i1 true)
-; CHECK-NEXT:    [[TMP35:%.*]] = call i32 @llvm.amdgcn.readlane{{(.i32)?}}(i32 [[TMP18]], i32 [[TMP34]])
+; CHECK-NEXT:    [[TMP35:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP18]], i32 [[TMP34]])
 ; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP18]], [[TMP35]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP36]])
 ; CHECK-NEXT:    [[TMP38:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP35]])
@@ -99,7 +99,7 @@ entry:
   %then.arg = add i32 %arg, 1
   %v.then = mul i32 %v, 2
   %state.then = insertvalue {i32} poison, i32 %v.then, 0
-  call void (i32, i32, { i32 }, ...) @lgc.cps.jump(i32 %cr.then, i32 2, {i32} %state.then, i32 %then.arg)
+  call void (i32, i32, { i32 }, ...) @lgc.cps.jump(i32 %cr.then, i32 2, {i32} %state.then, i32 poison, i32 %then.arg)
   unreachable
 }
 
diff --git a/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc b/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc
index bc0b8750ba..9e6f8bd5c9 100644
--- a/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc
+++ b/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc
@@ -47,7 +47,7 @@ define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
 ; CHECK-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP21]])
 ; CHECK-NEXT:    [[TMP23:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP22]], i1 true)
-; CHECK-NEXT:    [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane{{(.i32)?}}(i32 [[TMP20]], i32 [[TMP23]])
+; CHECK-NEXT:    [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP20]], i32 [[TMP23]])
 ; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i32 [[TMP20]], [[TMP24]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP25]])
 ; CHECK-NEXT:    [[TMP27:%.*]] = and i32 [[TMP24]], -64
@@ -102,7 +102,7 @@ define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc
 
   %p32 = call ptr addrspace(32) @lgc.cps.get.vsp()
 
-  call void (i32, i32, { i32 }, ...) @lgc.cps.jump(i32 %cr, i32 1, {i32} %state, i32 %arg, ptr addrspace(32) %p32)
+  call void (i32, i32, { i32 }, ...) @lgc.cps.jump(i32 %cr, i32 1, {i32} %state, i32 poison, i32 %arg, ptr addrspace(32) %p32)
   unreachable
 }
 
diff --git a/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc b/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc
index bfaeb3c10d..833e7518cf 100644
--- a/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc
+++ b/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc
@@ -6,7 +6,7 @@
 ; Function Attrs: alwaysinline nounwind
 define spir_func void @_rgen_1({} %state, i32 %rcr) #0 !spirv.ExecutionModel !15 !lgc.shaderstage !16 !continuation !18 !lgc.cps !17 {
 ; CHECK-LABEL: define amdgpu_cs_chain void @_rgen_1(
-; CHECK-SAME: i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[USERDATA0:%.*]], i32 inreg [[USERDATA1:%.*]], i32 inreg [[USERDATA2:%.*]], i32 inreg [[USERDATA3:%.*]], i32 inreg [[USERDATA4:%.*]], i32 inreg [[USERDATA5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]], i32 [[RCR:%.*]]) #[[ATTR0:[0-9]+]] align 64 !spirv.ExecutionModel !15 !lgc.shaderstage [[META16:![0-9]+]] !continuation [[META17:![0-9]+]] !lgc.cps [[META18:![0-9]+]] {
+; CHECK-SAME: i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[USERDATA0:%.*]], i32 inreg [[USERDATA1:%.*]], i32 inreg [[USERDATA2:%.*]], i32 inreg [[USERDATA3:%.*]], i32 inreg [[USERDATA4:%.*]], i32 inreg [[USERDATA5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]], i32 [[RCR:%.*]]) #[[ATTR0:[0-9]+]] align 64 !spirv.ExecutionModel [[META15:![0-9]+]] !lgc.shaderstage [[META16:![0-9]+]] !continuation [[META17:![0-9]+]] !lgc.cps [[META18:![0-9]+]] {
 ; CHECK-NEXT:  .entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4, addrspace(5)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.s.getpc()
@@ -106,7 +106,7 @@ define spir_func void @_rgen_1({} %state, i32 %rcr) #0 !spirv.ExecutionModel !15
 ; CHECK-NEXT:    [[TMP88:%.*]] = icmp ne i32 [[TMP87]], 0
 ; CHECK-NEXT:    [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP87]], i32 [[TMP85]]
 ; CHECK-NEXT:    [[TMP90:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP89]], i1 true)
-; CHECK-NEXT:    [[TMP91:%.*]] = call i32 @llvm.amdgcn.readlane{{(.i32)?}}(i32 [[TMP74]], i32 [[TMP90]])
+; CHECK-NEXT:    [[TMP91:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP74]], i32 [[TMP90]])
 ; CHECK-NEXT:    [[TMP92:%.*]] = icmp eq i32 [[TMP74]], [[TMP91]]
 ; CHECK-NEXT:    [[TMP93:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP92]])
 ; CHECK-NEXT:    [[TMP94:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP91]])
@@ -203,13 +203,13 @@ define spir_func void @_rgen_1({} %state, i32 %rcr) #0 !spirv.ExecutionModel !15
   %51 = or i32 %50, 1
   %52 = inttoptr i32 %51 to ptr
   %53 = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @_rgen_1.resume.0)
-  call void (...) @lgc.cps.jump(i32 %51, i32 2, {} poison, i32 %53, [1 x i32] undef, i32 %39)
+  call void (...) @lgc.cps.jump(i32 %51, i32 2, {} poison, i32 poison, i32 %53, [1 x i32] undef, i32 %39)
   unreachable
 }
 
 define void @_rgen_1.resume.0({} %0, i32 %1, [1 x i32] %2) !spirv.ExecutionModel !15 !lgc.shaderstage !16 !continuation !18 !lgc.cps !17 {
 ; CHECK-LABEL: define amdgpu_cs_chain void @_rgen_1.resume.0(
-; CHECK-SAME: i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[USERDATA0:%.*]], i32 inreg [[USERDATA1:%.*]], i32 inreg [[USERDATA2:%.*]], i32 inreg [[USERDATA3:%.*]], i32 inreg [[USERDATA4:%.*]], i32 inreg [[USERDATA5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]], i32 [[TMP0:%.*]], [1 x i32] [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] align 64 !spirv.ExecutionModel !15 !lgc.shaderstage [[META16]] !continuation [[META17]] !lgc.cps [[META18]] {
+; CHECK-SAME: i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[USERDATA0:%.*]], i32 inreg [[USERDATA1:%.*]], i32 inreg [[USERDATA2:%.*]], i32 inreg [[USERDATA3:%.*]], i32 inreg [[USERDATA4:%.*]], i32 inreg [[USERDATA5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]], i32 [[TMP0:%.*]], [1 x i32] [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] align 64 !spirv.ExecutionModel [[META15]] !lgc.shaderstage [[META16]] !continuation [[META17]] !lgc.cps [[META18]] {
 ; CHECK-NEXT:  entryresume.0:
 ; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4, addrspace(5)
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.s.getpc()
@@ -266,7 +266,7 @@ define void @_rgen_1.resume.0({} %0, i32 %1, [1 x i32] %2) !spirv.ExecutionModel
 ; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne i32 [[TMP45]], 0
 ; CHECK-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP45]], i32 [[TMP43]]
 ; CHECK-NEXT:    [[TMP48:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP47]], i1 true)
-; CHECK-NEXT:    [[TMP49:%.*]] = call i32 @llvm.amdgcn.readlane{{(.i32)?}}(i32 [[TMP32]], i32 [[TMP48]])
+; CHECK-NEXT:    [[TMP49:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP32]], i32 [[TMP48]])
 ; CHECK-NEXT:    [[TMP50:%.*]] = icmp eq i32 [[TMP32]], [[TMP49]]
 ; CHECK-NEXT:    [[TMP51:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP50]])
 ; CHECK-NEXT:    [[TMP52:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP49]])
@@ -389,6 +389,7 @@ attributes #7 = { nounwind willreturn memory(inaccessiblemem: read) }
 ;
 ;
 ;.
+; CHECK: [[META15]] = !{i32 5313}
 ; CHECK: [[META16]] = !{i32 7}
 ; CHECK: [[META17]] = !{ptr @_rgen_1}
 ; CHECK: [[META18]] = !{i32 1}
diff --git a/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc b/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc
index 8486eac1cb..b3937b9a27 100644
--- a/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc
+++ b/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc
@@ -69,7 +69,7 @@ define void @test.0({} %unused) !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} {
 ; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0
 ; CHECK-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP42]], i32 [[TMP40]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP44]], i1 true)
-; CHECK-NEXT:    [[TMP46:%.*]] = call i32 @llvm.amdgcn.readlane{{(.i32)?}}(i32 [[TMP29]], i32 [[TMP45]])
+; CHECK-NEXT:    [[TMP46:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP29]], i32 [[TMP45]])
 ; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq i32 [[TMP29]], [[TMP46]]
 ; CHECK-NEXT:    [[TMP48:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP47]])
 ; CHECK-NEXT:    [[TMP49:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP46]])
@@ -123,7 +123,7 @@ define void @test.0({} %unused) !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} {
   %state = insertvalue { ptr addrspace(32) } poison, ptr addrspace(32) %p2, 0
 
   %cr = call i32 @lgc.cps.as.continuation.reference__i32(ptr @test.1)
-  call void (...) @lgc.cps.jump(i32 %cr, i32 2, { ptr addrspace(32) } %state, ptr addrspace(32) %p2, i32 %q1)
+  call void (...) @lgc.cps.jump(i32 %cr, i32 2, { ptr addrspace(32) } %state, i32 poison, ptr addrspace(32) %p2, i32 %q1)
   unreachable
 }
 
@@ -171,7 +171,7 @@ define void @test.1({} %no_state, ptr addrspace(32) %p2, i32 %q1) !lgc.cps !{i32
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0
 ; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP33]], i32 [[TMP31]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP35]], i1 true)
-; CHECK-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.readlane{{(.i32)?}}(i32 [[TMP20]], i32 [[TMP36]])
+; CHECK-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP20]], i32 [[TMP36]])
 ; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i32 [[TMP20]], [[TMP37]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP38]])
 ; CHECK-NEXT:    [[TMP40:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP37]])
@@ -215,7 +215,7 @@ define void @test.1({} %no_state, ptr addrspace(32) %p2, i32 %q1) !lgc.cps !{i32
   %n99 = load i8, ptr addrspace(32) %p2
 
   %cr = call i32 @lgc.cps.as.continuation.reference__i32(ptr @test.2)
-  call void (...) @lgc.cps.jump(i32 %cr, i32 2, {} poison)
+  call void (...) @lgc.cps.jump(i32 %cr, i32 2, {} poison, i32 poison)
   unreachable
 }
 
@@ -269,7 +269,7 @@ define void @test.2({ ptr addrspace(32) } %state) !lgc.cps !{i32 1} !lgc.shaders
 ; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0
 ; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP36]], i32 [[TMP34]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP38]], i1 true)
-; CHECK-NEXT:    [[TMP40:%.*]] = call i32 @llvm.amdgcn.readlane{{(.i32)?}}(i32 [[TMP23]], i32 [[TMP39]])
+; CHECK-NEXT:    [[TMP40:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP23]], i32 [[TMP39]])
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq i32 [[TMP23]], [[TMP40]]
 ; CHECK-NEXT:    [[TMP42:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP41]])
 ; CHECK-NEXT:    [[TMP43:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP40]])
@@ -392,7 +392,7 @@ define void @test.gep({} %unused) !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} {
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP51]], i32 [[TMP49]]
 ; CHECK-NEXT:    [[TMP54:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP53]], i1 true)
-; CHECK-NEXT:    [[TMP55:%.*]] = call i32 @llvm.amdgcn.readlane{{(.i32)?}}(i32 [[TMP38]], i32 [[TMP54]])
+; CHECK-NEXT:    [[TMP55:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP38]], i32 [[TMP54]])
 ; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i32 [[TMP38]], [[TMP55]]
 ; CHECK-NEXT:    [[TMP57:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP56]])
 ; CHECK-NEXT:    [[TMP58:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP55]])
@@ -453,7 +453,7 @@ define void @test.gep({} %unused) !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} {
   store i32 %vsp.3.i, ptr addrspace(32) %3
 
   %cr = call i32 @lgc.cps.as.continuation.reference__i32(ptr @test.1)
-  call void (...) @lgc.cps.jump(i32 %cr, i32 2, {} poison, ptr addrspace(32) %vsp.3, i32 %vsp.3.i)
+  call void (...) @lgc.cps.jump(i32 %cr, i32 2, {} poison, i32 poison, ptr addrspace(32) %vsp.3, i32 %vsp.3.i)
   unreachable
 }
 
@@ -509,7 +509,7 @@ define void @test.nested.gep({} %unused) !lgc.cps !{i32 1} !lgc.shaderstage !{i3
 ; CHECK-NEXT:    [[TMP40:%.*]] = icmp ne i32 [[TMP39]], 0
 ; CHECK-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP39]], i32 [[TMP37]]
 ; CHECK-NEXT:    [[TMP42:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP41]], i1 true)
-; CHECK-NEXT:    [[TMP43:%.*]] = call i32 @llvm.amdgcn.readlane{{(.i32)?}}(i32 [[TMP26]], i32 [[TMP42]])
+; CHECK-NEXT:    [[TMP43:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 [[TMP42]])
 ; CHECK-NEXT:    [[TMP44:%.*]] = icmp eq i32 [[TMP26]], [[TMP43]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP44]])
 ; CHECK-NEXT:    [[TMP46:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP43]])
@@ -558,7 +558,7 @@ define void @test.nested.gep({} %unused) !lgc.cps !{i32 1} !lgc.shaderstage !{i3
   store i32 %vsp.i, ptr addrspace(32) %1
 
   %cr = call i32 @lgc.cps.as.continuation.reference__i32(ptr @test.1)
-  call void (...) @lgc.cps.jump(i32 %cr, i32 2, {} poison, ptr addrspace(32) %vsp, i32 %vsp.i)
+  call void (...) @lgc.cps.jump(i32 %cr, i32 2, {} poison, i32 poison, ptr addrspace(32) %vsp, i32 %vsp.i)
   unreachable
 }
 
@@ -607,7 +607,7 @@ define void @test.i64.reference({} %no_state, ptr addrspace(32) %p2, i32 %q1) !l
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0
 ; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP33]], i32 [[TMP31]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP35]], i1 true)
-; CHECK-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.readlane{{(.i32)?}}(i32 [[TMP20]], i32 [[TMP36]])
+; CHECK-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP20]], i32 [[TMP36]])
 ; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i32 [[TMP20]], [[TMP37]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP38]])
 ; CHECK-NEXT:    [[TMP40:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP37]])
@@ -652,7 +652,7 @@ define void @test.i64.reference({} %no_state, ptr addrspace(32) %p2, i32 %q1) !l
 
   %cr64 = call i64 @lgc.cps.as.continuation.reference__i64(ptr @test.2)
   %cr = trunc i64 %cr64 to i32
-  call void (...) @lgc.cps.jump(i32 %cr, i32 2, {} poison)
+  call void (...) @lgc.cps.jump(i32 %cr, i32 2, {} poison, i32 poison)
   unreachable
 }
 
diff --git a/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc b/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc
index d057bd62b7..e38b780fe7 100644
--- a/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc
+++ b/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc
@@ -16,7 +16,7 @@ define void @unify_jumps({i32} %state, i32 %arg, ptr %table) !lgc.cps !0 !lgc.sh
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.amdgcn.s.getpc()
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64 [[TMP6]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[VSP]], i32 -4
-; CHECK-NEXT:    [[CPS_STATE:%.*]] = load { i32 }, ptr addrspace(5) [[TMP8]], align 4
+; CHECK-NEXT:    [[CPS_STATE:%.*]] = load { i32 }, ptr addrspace(5) [[TMP8]], align 4, !amdgpu.last.use [[META4:![0-9]+]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr addrspace(5) [[TMP8]] to i32
 ; CHECK-NEXT:    store i32 [[TMP9]], ptr addrspace(5) [[TMP0]], align 4
 ; CHECK-NEXT:    [[V:%.*]] = extractvalue { i32 } [[CPS_STATE]], 0
@@ -69,7 +69,7 @@ define void @unify_jumps({i32} %state, i32 %arg, ptr %table) !lgc.cps !0 !lgc.sh
 ; CHECK-NEXT:    [[TMP40:%.*]] = icmp ne i32 [[TMP39]], 0
 ; CHECK-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP39]], i32 [[TMP37]]
 ; CHECK-NEXT:    [[TMP42:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP41]], i1 true)
-; CHECK-NEXT:    [[TMP43:%.*]] = call i32 @llvm.amdgcn.readlane{{(.i32)?}}(i32 [[TMP26]], i32 [[TMP42]])
+; CHECK-NEXT:    [[TMP43:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 [[TMP42]])
 ; CHECK-NEXT:    [[TMP44:%.*]] = icmp eq i32 [[TMP26]], [[TMP43]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP44]])
 ; CHECK-NEXT:    [[TMP46:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP43]])
@@ -119,14 +119,14 @@ then:
   %then.arg = add i32 %arg, 1
   %v.then = mul i32 %v, 2
   %state.then = insertvalue {i32} poison, i32 %v.then, 0
-  call void (...) @lgc.cps.jump(i32 %cr.then, i32 2, {i32} %state.then, i32 %then.arg)
+  call void (...) @lgc.cps.jump(i32 %cr.then, i32 2, {i32} %state.then, i32 poison, i32 %then.arg)
   unreachable
 
 else:
   %table.1 = getelementptr i32, ptr %table, i32 1
   %cr.else = load i32, ptr %table.1
   %else.arg = uitofp i32 %arg to float
-  call void (...) @lgc.cps.jump(i32 %cr.else, i32 2, {} poison, float %else.arg, i32 5)
+  call void (...) @lgc.cps.jump(i32 %cr.else, i32 2, {} poison, i32 poison, float %else.arg, i32 5)
   unreachable
 }
 
@@ -143,7 +143,7 @@ define void @unify_jump_ret({i32} %state, i32 %arg, ptr %table) !lgc.cps !0 !lgc
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.amdgcn.s.getpc()
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64 [[TMP6]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[VSP]], i32 -4
-; CHECK-NEXT:    [[CPS_STATE:%.*]] = load { i32 }, ptr addrspace(5) [[TMP8]], align 4
+; CHECK-NEXT:    [[CPS_STATE:%.*]] = load { i32 }, ptr addrspace(5) [[TMP8]], align 4, !amdgpu.last.use [[META4]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr addrspace(5) [[TMP8]] to i32
 ; CHECK-NEXT:    store i32 [[TMP9]], ptr addrspace(5) [[TMP0]], align 4
 ; CHECK-NEXT:    [[V:%.*]] = extractvalue { i32 } [[CPS_STATE]], 0
@@ -188,7 +188,7 @@ define void @unify_jump_ret({i32} %state, i32 %arg, ptr %table) !lgc.cps !0 !lgc
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
 ; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP34]], i32 [[TMP32]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP36]], i1 true)
-; CHECK-NEXT:    [[TMP38:%.*]] = call i32 @llvm.amdgcn.readlane{{(.i32)?}}(i32 [[TMP21]], i32 [[TMP37]])
+; CHECK-NEXT:    [[TMP38:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP21]], i32 [[TMP37]])
 ; CHECK-NEXT:    [[TMP39:%.*]] = icmp eq i32 [[TMP21]], [[TMP38]]
 ; CHECK-NEXT:    [[TMP40:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP39]])
 ; CHECK-NEXT:    [[TMP41:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP38]])
@@ -243,7 +243,7 @@ then:
   %then.arg = add i32 %arg, 1
   %v.then = mul i32 %v, 2
   %state.then = insertvalue {i32} poison, i32 %v.then, 0
-  call void (...) @lgc.cps.jump(i32 %cr.then, i32 2, {i32} %state.then, i32 %then.arg)
+  call void (...) @lgc.cps.jump(i32 %cr.then, i32 2, {i32} %state.then, i32 poison, i32 %then.arg)
   unreachable
 
 else:
diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/convert.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/convert.lgc
index 943299a9bc..3cbc07b42d 100644
--- a/lgc/test/Transforms/LowerCooperativeMatrix/convert.lgc
+++ b/lgc/test/Transforms/LowerCooperativeMatrix/convert.lgc
@@ -9,9 +9,9 @@ define <8 x float> @convert_f16_to_accumulator(<8 x float> %fact) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[FACT:%.*]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP4]], <8 x i32> zeroinitializer, <8 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:    [[RESHAPE16BIT:%.*]] = lshr <8 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[RESHAPE16BIT1:%.*]] = bitcast <8 x i32> [[RESHAPE16BIT]] to <8 x float>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[RESHAPE16BIT1]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[ACCUM1:%.*]] = lshr <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[ACCUM2:%.*]] = bitcast <8 x i32> [[ACCUM1]] to <8 x float>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[ACCUM2]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x float> [[TMP7]]
 ;
   %accum = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %fact, i32 1, i32 1, i32 0, i32 1)
@@ -28,28 +28,28 @@ define <8 x float> @convert_f16_to_factor(<8 x float> %accum) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.amdgcn.permlanex16{{(.i32)?}}(i32 [[TMP7]], i32 [[TMP8]], i32 1985229328, i32 -19088744, i1 false, i1 false)
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 [[TMP7]], i32 [[TMP8]], i32 1985229328, i32 -19088744, i1 false, i1 false)
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.permlanex16{{(.i32)?}}(i32 [[TMP10]], i32 [[TMP11]], i32 1985229328, i32 -19088744, i1 false, i1 false)
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 [[TMP10]], i32 [[TMP11]], i32 1985229328, i32 -19088744, i1 false, i1 false)
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.permlanex16{{(.i32)?}}(i32 [[TMP13]], i32 [[TMP14]], i32 1985229328, i32 -19088744, i1 false, i1 false)
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 [[TMP13]], i32 [[TMP14]], i32 1985229328, i32 -19088744, i1 false, i1 false)
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
-; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.amdgcn.permlanex16{{(.i32)?}}(i32 [[TMP16]], i32 [[TMP17]], i32 1985229328, i32 -19088744, i1 false, i1 false)
+; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 [[TMP16]], i32 [[TMP17]], i32 1985229328, i32 -19088744, i1 false, i1 false)
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
-; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.amdgcn.permlanex16{{(.i32)?}}(i32 [[TMP19]], i32 [[TMP20]], i32 1985229328, i32 -19088744, i1 false, i1 false)
+; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 [[TMP19]], i32 [[TMP20]], i32 1985229328, i32 -19088744, i1 false, i1 false)
 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
 ; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
-; CHECK-NEXT:    [[TMP24:%.*]] = call i32 @llvm.amdgcn.permlanex16{{(.i32)?}}(i32 [[TMP22]], i32 [[TMP23]], i32 1985229328, i32 -19088744, i1 false, i1 false)
+; CHECK-NEXT:    [[TMP24:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 [[TMP22]], i32 [[TMP23]], i32 1985229328, i32 -19088744, i1 false, i1 false)
 ; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
-; CHECK-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.permlanex16{{(.i32)?}}(i32 [[TMP25]], i32 [[TMP26]], i32 1985229328, i32 -19088744, i1 false, i1 false)
+; CHECK-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 [[TMP25]], i32 [[TMP26]], i32 1985229328, i32 -19088744, i1 false, i1 false)
 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
 ; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
-; CHECK-NEXT:    [[TMP30:%.*]] = call i32 @llvm.amdgcn.permlanex16{{(.i32)?}}(i32 [[TMP28]], i32 [[TMP29]], i32 1985229328, i32 -19088744, i1 false, i1 false)
+; CHECK-NEXT:    [[TMP30:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 [[TMP28]], i32 [[TMP29]], i32 1985229328, i32 -19088744, i1 false, i1 false)
 ; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x i32> poison, i32 [[TMP9]], i64 0
 ; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP12]], i64 1
 ; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP15]], i64 2
@@ -86,6 +86,193 @@ define <8 x i32> @convert_f16_to_bf16(<8 x float> %acc) {
   ret <8 x i32> %fConvert
 }
 
+define <8 x float> @convert_u4_to_f16(<2 x i32> %load) { ; Expected lowering: every input byte is split into its low and high 4-bit halves, each half is converted with uitofp to half, and the 16 halves are returned bitcast to <8 x float>.
+; CHECK-LABEL: @convert_u4_to_f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[LOAD:%.*]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i8> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = and i8 [[TMP3]], 15
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i8 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i8> [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP7:%.*]] = and i8 [[TMP6]], 15
+; CHECK-NEXT:    [[TMP8:%.*]] = lshr i8 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i8> [[TMP2]], i64 2
+; CHECK-NEXT:    [[TMP10:%.*]] = and i8 [[TMP9]], 15
+; CHECK-NEXT:    [[TMP11:%.*]] = lshr i8 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i8> [[TMP2]], i64 3
+; CHECK-NEXT:    [[TMP13:%.*]] = and i8 [[TMP12]], 15
+; CHECK-NEXT:    [[TMP14:%.*]] = lshr i8 [[TMP12]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i8> [[TMP2]], i64 4
+; CHECK-NEXT:    [[TMP16:%.*]] = and i8 [[TMP15]], 15
+; CHECK-NEXT:    [[TMP17:%.*]] = lshr i8 [[TMP15]], 4
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i8> [[TMP2]], i64 5
+; CHECK-NEXT:    [[TMP19:%.*]] = and i8 [[TMP18]], 15
+; CHECK-NEXT:    [[TMP20:%.*]] = lshr i8 [[TMP18]], 4
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <8 x i8> [[TMP2]], i64 6
+; CHECK-NEXT:    [[TMP22:%.*]] = and i8 [[TMP21]], 15
+; CHECK-NEXT:    [[TMP23:%.*]] = lshr i8 [[TMP21]], 4
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <8 x i8> [[TMP2]], i64 7
+; CHECK-NEXT:    [[TMP25:%.*]] = and i8 [[TMP24]], 15
+; CHECK-NEXT:    [[TMP26:%.*]] = lshr i8 [[TMP24]], 4
+; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i8 [[TMP4]] to half
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <16 x half> poison, half [[TMP27]], i64 0
+; CHECK-NEXT:    [[TMP29:%.*]] = uitofp i8 [[TMP5]] to half
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <16 x half> [[TMP28]], half [[TMP29]], i64 1
+; CHECK-NEXT:    [[TMP31:%.*]] = uitofp i8 [[TMP7]] to half
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <16 x half> [[TMP30]], half [[TMP31]], i64 2
+; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i8 [[TMP8]] to half
+; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <16 x half> [[TMP32]], half [[TMP33]], i64 3
+; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i8 [[TMP10]] to half
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <16 x half> [[TMP34]], half [[TMP35]], i64 4
+; CHECK-NEXT:    [[TMP37:%.*]] = uitofp i8 [[TMP11]] to half
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <16 x half> [[TMP36]], half [[TMP37]], i64 5
+; CHECK-NEXT:    [[TMP39:%.*]] = uitofp i8 [[TMP13]] to half
+; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <16 x half> [[TMP38]], half [[TMP39]], i64 6
+; CHECK-NEXT:    [[TMP41:%.*]] = uitofp i8 [[TMP14]] to half
+; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <16 x half> [[TMP40]], half [[TMP41]], i64 7
+; CHECK-NEXT:    [[TMP43:%.*]] = uitofp i8 [[TMP16]] to half
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <16 x half> [[TMP42]], half [[TMP43]], i64 8
+; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i8 [[TMP17]] to half
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <16 x half> [[TMP44]], half [[TMP45]], i64 9
+; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i8 [[TMP19]] to half
+; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <16 x half> [[TMP46]], half [[TMP47]], i64 10
+; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i8 [[TMP20]] to half
+; CHECK-NEXT:    [[TMP50:%.*]] = insertelement <16 x half> [[TMP48]], half [[TMP49]], i64 11
+; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i8 [[TMP22]] to half
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <16 x half> [[TMP50]], half [[TMP51]], i64 12
+; CHECK-NEXT:    [[TMP53:%.*]] = uitofp i8 [[TMP23]] to half
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <16 x half> [[TMP52]], half [[TMP53]], i64 13
+; CHECK-NEXT:    [[TMP55:%.*]] = uitofp i8 [[TMP25]] to half
+; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <16 x half> [[TMP54]], half [[TMP55]], i64 14
+; CHECK-NEXT:    [[TMP57:%.*]] = uitofp i8 [[TMP26]] to half
+; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <16 x half> [[TMP56]], half [[TMP57]], i64 15
+; CHECK-NEXT:    [[TMP59:%.*]] = bitcast <16 x half> [[TMP58]] to <8 x float>
+; CHECK-NEXT:    ret <8 x float> [[TMP59]]
+;
+  %convert = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 43, <2 x i32> %load, i32 10, i32 1, i32 0, i32 0) ; NOTE(review): i32 43 is presumably the uitofp cast opcode, and i32 10 / i32 1 the source / destination element-type codes - confirm against the lgc dialect definition.
+  ret <8 x float> %convert
+}
+
+define <2 x i32> @convert_u8_to_u4(<4 x i32> %load) { ; Expected lowering: the 16 input bytes are packed pairwise into 8 bytes (even-index byte masked to its low 4 bits, odd-index byte shifted left by 4, then OR-ed) and returned bitcast to <2 x i32>.
+; CHECK-LABEL: @convert_u8_to_u4(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[LOAD:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x i8> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = and i8 [[TMP3]], 15
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x i8> [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = shl i8 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x i8> [[TMP2]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = and i8 [[TMP7]], 15
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i8> [[TMP2]], i64 3
+; CHECK-NEXT:    [[TMP10:%.*]] = shl i8 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i8> [[TMP2]], i64 4
+; CHECK-NEXT:    [[TMP12:%.*]] = and i8 [[TMP11]], 15
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i8> [[TMP2]], i64 5
+; CHECK-NEXT:    [[TMP14:%.*]] = shl i8 [[TMP13]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x i8> [[TMP2]], i64 6
+; CHECK-NEXT:    [[TMP16:%.*]] = and i8 [[TMP15]], 15
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i8> [[TMP2]], i64 7
+; CHECK-NEXT:    [[TMP18:%.*]] = shl i8 [[TMP17]], 4
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x i8> [[TMP2]], i64 8
+; CHECK-NEXT:    [[TMP20:%.*]] = and i8 [[TMP19]], 15
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <16 x i8> [[TMP2]], i64 9
+; CHECK-NEXT:    [[TMP22:%.*]] = shl i8 [[TMP21]], 4
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <16 x i8> [[TMP2]], i64 10
+; CHECK-NEXT:    [[TMP24:%.*]] = and i8 [[TMP23]], 15
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <16 x i8> [[TMP2]], i64 11
+; CHECK-NEXT:    [[TMP26:%.*]] = shl i8 [[TMP25]], 4
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <16 x i8> [[TMP2]], i64 12
+; CHECK-NEXT:    [[TMP28:%.*]] = and i8 [[TMP27]], 15
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <16 x i8> [[TMP2]], i64 13
+; CHECK-NEXT:    [[TMP30:%.*]] = shl i8 [[TMP29]], 4
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <16 x i8> [[TMP2]], i64 14
+; CHECK-NEXT:    [[TMP32:%.*]] = and i8 [[TMP31]], 15
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <16 x i8> [[TMP2]], i64 15
+; CHECK-NEXT:    [[TMP34:%.*]] = shl i8 [[TMP33]], 4
+; CHECK-NEXT:    [[TMP35:%.*]] = or i8 [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <8 x i8> poison, i8 [[TMP35]], i64 0
+; CHECK-NEXT:    [[TMP37:%.*]] = or i8 [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <8 x i8> [[TMP36]], i8 [[TMP37]], i64 1
+; CHECK-NEXT:    [[TMP39:%.*]] = or i8 [[TMP12]], [[TMP14]]
+; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <8 x i8> [[TMP38]], i8 [[TMP39]], i64 2
+; CHECK-NEXT:    [[TMP41:%.*]] = or i8 [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <8 x i8> [[TMP40]], i8 [[TMP41]], i64 3
+; CHECK-NEXT:    [[TMP43:%.*]] = or i8 [[TMP20]], [[TMP22]]
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <8 x i8> [[TMP42]], i8 [[TMP43]], i64 4
+; CHECK-NEXT:    [[TMP45:%.*]] = or i8 [[TMP24]], [[TMP26]]
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <8 x i8> [[TMP44]], i8 [[TMP45]], i64 5
+; CHECK-NEXT:    [[TMP47:%.*]] = or i8 [[TMP28]], [[TMP30]]
+; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <8 x i8> [[TMP46]], i8 [[TMP47]], i64 6
+; CHECK-NEXT:    [[TMP49:%.*]] = or i8 [[TMP32]], [[TMP34]]
+; CHECK-NEXT:    [[TMP50:%.*]] = insertelement <8 x i8> [[TMP48]], i8 [[TMP49]], i64 7
+; CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP50]] to <2 x i32>
+; CHECK-NEXT:    ret <2 x i32> [[TMP51]]
+;
+  %u4Convert = call <2 x i32> (...) @lgc.cooperative.matrix.convert__v2i32(i32 38, <4 x i32> %load, i32 3, i32 10, i32 0, i32 0) ; NOTE(review): i32 38 is presumably the trunc cast opcode, and i32 3 / i32 10 the source / destination element-type codes - confirm against the lgc dialect definition.
+  ret <2 x i32> %u4Convert
+}
+
+define <2 x i32> @convert_fp16_to_i4(<8 x i32> %load) { ; Expected lowering: the input is viewed as 16 halves, converted with fptosi to i32, truncated to i8, then packed pairwise into 8 bytes (low 4 bits from the even element, high 4 bits from the odd element) and returned as <2 x i32>.
+; CHECK-LABEL: @convert_fp16_to_i4(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[LOAD:%.*]] to <16 x half>
+; CHECK-NEXT:    [[CONVERTINTOINT32:%.*]] = fptosi <16 x half> [[TMP2]] to <16 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <16 x i32> [[CONVERTINTOINT32]] to <16 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i8> [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = and i8 [[TMP4]], 15
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x i8> [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i8 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x i8> [[TMP3]], i64 2
+; CHECK-NEXT:    [[TMP9:%.*]] = and i8 [[TMP8]], 15
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x i8> [[TMP3]], i64 3
+; CHECK-NEXT:    [[TMP11:%.*]] = shl i8 [[TMP10]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i8> [[TMP3]], i64 4
+; CHECK-NEXT:    [[TMP13:%.*]] = and i8 [[TMP12]], 15
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i8> [[TMP3]], i64 5
+; CHECK-NEXT:    [[TMP15:%.*]] = shl i8 [[TMP14]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i8> [[TMP3]], i64 6
+; CHECK-NEXT:    [[TMP17:%.*]] = and i8 [[TMP16]], 15
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i8> [[TMP3]], i64 7
+; CHECK-NEXT:    [[TMP19:%.*]] = shl i8 [[TMP18]], 4
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i8> [[TMP3]], i64 8
+; CHECK-NEXT:    [[TMP21:%.*]] = and i8 [[TMP20]], 15
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i8> [[TMP3]], i64 9
+; CHECK-NEXT:    [[TMP23:%.*]] = shl i8 [[TMP22]], 4
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <16 x i8> [[TMP3]], i64 10
+; CHECK-NEXT:    [[TMP25:%.*]] = and i8 [[TMP24]], 15
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i8> [[TMP3]], i64 11
+; CHECK-NEXT:    [[TMP27:%.*]] = shl i8 [[TMP26]], 4
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i8> [[TMP3]], i64 12
+; CHECK-NEXT:    [[TMP29:%.*]] = and i8 [[TMP28]], 15
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i8> [[TMP3]], i64 13
+; CHECK-NEXT:    [[TMP31:%.*]] = shl i8 [[TMP30]], 4
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i8> [[TMP3]], i64 14
+; CHECK-NEXT:    [[TMP33:%.*]] = and i8 [[TMP32]], 15
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i8> [[TMP3]], i64 15
+; CHECK-NEXT:    [[TMP35:%.*]] = shl i8 [[TMP34]], 4
+; CHECK-NEXT:    [[TMP36:%.*]] = or i8 [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <8 x i8> poison, i8 [[TMP36]], i64 0
+; CHECK-NEXT:    [[TMP38:%.*]] = or i8 [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <8 x i8> [[TMP37]], i8 [[TMP38]], i64 1
+; CHECK-NEXT:    [[TMP40:%.*]] = or i8 [[TMP13]], [[TMP15]]
+; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP40]], i64 2
+; CHECK-NEXT:    [[TMP42:%.*]] = or i8 [[TMP17]], [[TMP19]]
+; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <8 x i8> [[TMP41]], i8 [[TMP42]], i64 3
+; CHECK-NEXT:    [[TMP44:%.*]] = or i8 [[TMP21]], [[TMP23]]
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <8 x i8> [[TMP43]], i8 [[TMP44]], i64 4
+; CHECK-NEXT:    [[TMP46:%.*]] = or i8 [[TMP25]], [[TMP27]]
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP46]], i64 5
+; CHECK-NEXT:    [[TMP48:%.*]] = or i8 [[TMP29]], [[TMP31]]
+; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <8 x i8> [[TMP47]], i8 [[TMP48]], i64 6
+; CHECK-NEXT:    [[TMP50:%.*]] = or i8 [[TMP33]], [[TMP35]]
+; CHECK-NEXT:    [[TMP51:%.*]] = insertelement <8 x i8> [[TMP49]], i8 [[TMP50]], i64 7
+; CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP51]] to <2 x i32>
+; CHECK-NEXT:    ret <2 x i32> [[TMP52]]
+;
+  %u4Convert = call <2 x i32> (...) @lgc.cooperative.matrix.convert__v2i32(i32 42, <8 x i32> %load, i32 1, i32 10, i32 0, i32 0) ; NOTE(review): i32 42 is presumably the fptosi cast opcode, and i32 1 / i32 10 the source / destination element-type codes - confirm against the lgc dialect definition.
+  ret <2 x i32> %u4Convert
+}
+
 declare i1 @getcc()
 declare <8 x float> @process1(<8 x float>)
 
@@ -93,4 +280,6 @@ declare <8 x float> @lgc.cooperative.matrix.load__v8f32(...)
 declare <8 x float> @lgc.cooperative.matrix.transpose__v8f32(...)
 declare <8 x float> @lgc.cooperative.matrix.convert__v8f32(...)
 declare <8 x i32> @lgc.cooperative.matrix.convert__v8i32(...)
+declare <8 x float> @lgc.cooperative.matrix.convert_v8i32(...) ; NOTE(review): single-underscore name with an <8 x float> return differs from the neighbouring convert__v8f32 / convert__v8i32 declarations and no visible test calls it - confirm whether this declaration is intended.
+declare <2 x i32> @lgc.cooperative.matrix.convert__v2i32(...)
 declare void @lgc.cooperative.matrix.store(...)
diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/load-wave64.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/load-wave64.lgc
index 1d245d0a41..53ede1663b 100644
--- a/lgc/test/Transforms/LowerCooperativeMatrix/load-wave64.lgc
+++ b/lgc/test/Transforms/LowerCooperativeMatrix/load-wave64.lgc
@@ -281,8 +281,54 @@ define <8 x i32> @test_i32_cd_layout(ptr addrspace(7) %ptr) !spirv.ExecutionMode
   ret <8 x i32> %a
 }
 
+define <2 x i32> @test_i4_ab_layout(ptr addrspace(7) %ptr) !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; Expected lowering: eight unordered-atomic i8 loads at byte offsets 0..7 from a base of (lane id srem 16) * 128, gathered into <8 x i8> and returned bitcast to <2 x i32>.
+; CHECK-LABEL: @test_i4_ab_layout(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = srem i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 128
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 0, [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load atomic i8, ptr addrspace(7) [[TMP7]] unordered, align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i8> poison, i8 [[TMP8]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP5]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr addrspace(7) [[PTR]], i32 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load atomic i8, ptr addrspace(7) [[TMP11]] unordered, align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP12]], i64 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr addrspace(7) [[PTR]], i32 [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load atomic i8, ptr addrspace(7) [[TMP15]] unordered, align 2
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x i8> [[TMP13]], i8 [[TMP16]], i64 2
+; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP5]], 3
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr addrspace(7) [[PTR]], i32 [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = load atomic i8, ptr addrspace(7) [[TMP19]] unordered, align 1
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP20]], i64 3
+; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr addrspace(7) [[PTR]], i32 [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load atomic i8, ptr addrspace(7) [[TMP23]] unordered, align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP24]], i64 4
+; CHECK-NEXT:    [[TMP26:%.*]] = add i32 [[TMP5]], 5
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr addrspace(7) [[PTR]], i32 [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = load atomic i8, ptr addrspace(7) [[TMP27]] unordered, align 1
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x i8> [[TMP25]], i8 [[TMP28]], i64 5
+; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP5]], 6
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr i8, ptr addrspace(7) [[PTR]], i32 [[TMP30]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load atomic i8, ptr addrspace(7) [[TMP31]] unordered, align 2
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x i8> [[TMP29]], i8 [[TMP32]], i64 6
+; CHECK-NEXT:    [[TMP34:%.*]] = add i32 [[TMP5]], 7
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr addrspace(7) [[PTR]], i32 [[TMP34]]
+; CHECK-NEXT:    [[TMP36:%.*]] = load atomic i8, ptr addrspace(7) [[TMP35]] unordered, align 1
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP36]], i64 7
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i8> [[TMP37]] to <2 x i32>
+; CHECK-NEXT:    ret <2 x i32> [[TMP38]]
+;
+  %a = call <2 x i32> (...) @lgc.cooperative.matrix.load__v2i32(ptr addrspace(7) %ptr, i32 128, i1 true, i32 10, i32 0, i32 2, i32 16) ; NOTE(review): i32 128 appears to be the stride (it matches the multiply by 128 expected above); the meaning of the remaining operands is not visible here - confirm against the lgc dialect definition.
+  ret <2 x i32> %a
+}
 declare <8 x float> @lgc.cooperative.matrix.load__v8f32(...)
 declare <8 x i32> @lgc.cooperative.matrix.load__v8i32(...)
+declare <2 x i32> @lgc.cooperative.matrix.load__v2i32(...)
 
 !llpc.compute.mode = !{!0}
 !lgc.client = !{!1}
diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/packed-accumulators-wave64.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/packed-accumulators-wave64.lgc
index fa424765bf..36f6c26b56 100644
--- a/lgc/test/Transforms/LowerCooperativeMatrix/packed-accumulators-wave64.lgc
+++ b/lgc/test/Transforms/LowerCooperativeMatrix/packed-accumulators-wave64.lgc
@@ -3,7 +3,7 @@
 
 define <8 x float> @test_pack_f16(<8 x float> %a, <8 x float> %b) !spirv.ExecutionModel !8 !lgc.shaderstage !9 {
 ; CHECK-LABEL: define <8 x float> @test_pack_f16
-; CHECK-SAME: (<8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) !spirv.ExecutionModel !5 !lgc.shaderstage [[META6:![0-9]+]] {
+; CHECK-SAME: (<8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) !spirv.ExecutionModel [[META5:![0-9]+]] !lgc.shaderstage [[META6:![0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x float> [[A]] to <16 x half>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[B]] to <16 x half>
@@ -18,7 +18,7 @@ entry:
 
 define <8 x float> @test_unpack_lo(<8 x float> %packed) !spirv.ExecutionModel !8 !lgc.shaderstage !9 {
 ; CHECK-LABEL: define <8 x float> @test_unpack_lo
-; CHECK-SAME: (<8 x float> [[PACKED:%.*]]) !spirv.ExecutionModel !5 !lgc.shaderstage [[META6]] {
+; CHECK-SAME: (<8 x float> [[PACKED:%.*]]) !spirv.ExecutionModel [[META5]] !lgc.shaderstage [[META6]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x float> [[PACKED]] to <16 x half>
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x half> [[TMP0]], <16 x half> poison, <16 x i32> <i32 0, i32 poison, i32 2, i32 poison, i32 4, i32 poison, i32 6, i32 poison, i32 8, i32 poison, i32 10, i32 poison, i32 12, i32 poison, i32 14, i32 poison>
@@ -32,7 +32,7 @@ entry:
 
 define <8 x float> @test_unpack_hi(<8 x float> %packed) !spirv.ExecutionModel !8 !lgc.shaderstage !9 {
 ; CHECK-LABEL: define <8 x float> @test_unpack_hi
-; CHECK-SAME: (<8 x float> [[PACKED:%.*]]) !spirv.ExecutionModel !5 !lgc.shaderstage [[META6]] {
+; CHECK-SAME: (<8 x float> [[PACKED:%.*]]) !spirv.ExecutionModel [[META5]] !lgc.shaderstage [[META6]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x float> [[PACKED]] to <16 x half>
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x half> [[TMP0]], <16 x half> poison, <16 x i32> <i32 1, i32 poison, i32 3, i32 poison, i32 5, i32 poison, i32 7, i32 poison, i32 9, i32 poison, i32 11, i32 poison, i32 13, i32 poison, i32 15, i32 poison>
@@ -46,7 +46,7 @@ entry:
 
 define <8 x float> @test_packed_times_scalar(<8 x float> %packed, <2 x half> %scalar) !spirv.ExecutionModel !8 !lgc.shaderstage !9 {
 ; CHECK-LABEL: define <8 x float> @test_packed_times_scalar
-; CHECK-SAME: (<8 x float> [[PACKED:%.*]], <2 x half> [[SCALAR:%.*]]) !spirv.ExecutionModel !5 !lgc.shaderstage [[META6]] {
+; CHECK-SAME: (<8 x float> [[PACKED:%.*]], <2 x half> [[SCALAR:%.*]]) !spirv.ExecutionModel [[META5]] !lgc.shaderstage [[META6]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x float> [[PACKED]] to <16 x half>
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x half> [[TMP0]], <16 x half> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/lgc/test/Transforms/PatchBufferOp/buffer-index-op.lgc b/lgc/test/Transforms/PatchBufferOp/buffer-index-op.lgc
new file mode 100644
index 0000000000..7e25b78036
--- /dev/null
+++ b/lgc/test/Transforms/PatchBufferOp/buffer-index-op.lgc
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 2
+; RUN: lgc --mcpu=gfx1100 -o - -passes='require<lgc-pipeline-state>,function(lgc-structurize-buffers)' %s | FileCheck --check-prefixes=GFX11 %s
+
+define [12 x i32] @strided_buffer_mark_const_zero_idx(ptr addrspace(7) %buf) { ; lgc.buffer.index with constant operands (48, 0): the expected output is a plain byte GEP at offset 0 on %buf, with no strided-pointer ops.
+; GFX11-LABEL: define [12 x i32] @strided_buffer_mark_const_zero_idx
+; GFX11-SAME: (ptr addrspace(7) [[BUF:%.*]]) {
+; GFX11-NEXT:  entry:
+; GFX11-NEXT:    [[GEP_RESTORE:%.*]] = getelementptr i8, ptr addrspace(7) [[BUF]], i32 0
+; GFX11-NEXT:    [[RES:%.*]] = load [12 x i32], ptr addrspace(7) [[GEP_RESTORE]], align 4
+; GFX11-NEXT:    ret [12 x i32] [[RES]]
+;
+entry:
+  %elem = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) %buf, i32 48, i32 0) #0
+  %res = load [12 x i32], ptr addrspace(7) %elem, align 4
+  ret [12 x i32] %res
+}
+
+define [12 x i32] @strided_buffer_mark_const_nonzero_idx(ptr addrspace(7) %buf) { ; lgc.buffer.index with constant operands (48, 12): the expected output is a plain byte GEP at offset 576 (= 48 * 12), with no strided-pointer ops.
+; GFX11-LABEL: define [12 x i32] @strided_buffer_mark_const_nonzero_idx
+; GFX11-SAME: (ptr addrspace(7) [[BUF:%.*]]) {
+; GFX11-NEXT:  entry:
+; GFX11-NEXT:    [[GEP_RESTORE:%.*]] = getelementptr i8, ptr addrspace(7) [[BUF]], i32 576
+; GFX11-NEXT:    [[RES:%.*]] = load [12 x i32], ptr addrspace(7) [[GEP_RESTORE]], align 4
+; GFX11-NEXT:    ret [12 x i32] [[RES]]
+;
+entry:
+  %elem = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) %buf, i32 48, i32 12) #0
+  %res = load [12 x i32], ptr addrspace(7) %elem, align 4
+  ret [12 x i32] %res
+}
+
+define [12 x i32] @strided_buffer_mark_nonconst_idx(ptr addrspace(7) %buf, i32 %idx) { ; lgc.buffer.index with operands (48, %idx): the expected output converts %buf via lgc.convert.to.strided.buffer.pointer(..., 48), applies lgc.strided.index.add with %idx, and loads through the addrspace(9) pointer.
+; GFX11-LABEL: define [12 x i32] @strided_buffer_mark_nonconst_idx
+; GFX11-SAME: (ptr addrspace(7) [[BUF:%.*]], i32 [[IDX:%.*]]) {
+; GFX11-NEXT:  entry:
+; GFX11-NEXT:    [[TMP0:%.*]] = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) [[BUF]], i32 48)
+; GFX11-NEXT:    [[TMP1:%.*]] = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) [[TMP0]], i32 [[IDX]])
+; GFX11-NEXT:    [[RES:%.*]] = load [12 x i32], ptr addrspace(9) [[TMP1]], align 4
+; GFX11-NEXT:    ret [12 x i32] [[RES]]
+;
+entry:
+  %elem = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) %buf, i32 48, i32 %idx) #0
+  %res = load [12 x i32], ptr addrspace(7) %elem, align 4
+  ret [12 x i32] %res
+}
+
+define i32 @strided_buffer_mark_small_stride(ptr addrspace(7) %buf, i32 %idx) { ; lgc.buffer.index with operands (4, %idx) feeding an i32 load: the expected output multiplies %idx by 4 and uses a plain byte GEP on %buf, with no strided-pointer ops.
+; GFX11-LABEL: define i32 @strided_buffer_mark_small_stride
+; GFX11-SAME: (ptr addrspace(7) [[BUF:%.*]], i32 [[IDX:%.*]]) {
+; GFX11-NEXT:  entry:
+; GFX11-NEXT:    [[TMP0:%.*]] = mul i32 [[IDX]], 4
+; GFX11-NEXT:    [[GEP_RESTORE:%.*]] = getelementptr i8, ptr addrspace(7) [[BUF]], i32 [[TMP0]]
+; GFX11-NEXT:    [[RES:%.*]] = load i32, ptr addrspace(7) [[GEP_RESTORE]], align 4
+; GFX11-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %elem = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) %buf, i32 4, i32 %idx) #0
+  %res = load i32, ptr addrspace(7) %elem, align 4
+  ret i32 %res
+}
+
+; Function Attrs: nounwind willreturn memory(none)
+declare ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7), i32, i32) local_unnamed_addr #0
+
+attributes #0 = { nounwind willreturn memory(none) }
diff --git a/lgc/test/Transforms/PatchBufferOp/buffer.atomic.ops.lgc b/lgc/test/Transforms/PatchBufferOp/buffer.atomic.ops.lgc
new file mode 100644
index 0000000000..a8624aead7
--- /dev/null
+++ b/lgc/test/Transforms/PatchBufferOp/buffer.atomic.ops.lgc
@@ -0,0 +1,471 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc
+; RUN: lgc -o - -passes='require<lgc-pipeline-state>,function(lgc-patch-buffer-op)' %s | FileCheck --check-prefixes=CHECK %s
+
+define amdgpu_gfx void @raw_atomic_load(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; Raw-pointer case: monotonic atomic i32 load from the lgc.buffer.desc.to.ptr result; the expected output is a single llvm.amdgcn.raw.atomic.buffer.load.i32 call on %desc.
+; CHECK-LABEL: @raw_atomic_load(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 5)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %load = load atomic i32, ptr addrspace(7) %ptr monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @raw_atomicrmw_xchg(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; Raw-pointer case: monotonic atomicrmw xchg of i64 1; the expected output is a single llvm.amdgcn.raw.buffer.atomic.swap.i64 call on %desc.
+; CHECK-LABEL: @raw_atomicrmw_xchg(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.swap.i64(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %xchg = atomicrmw xchg ptr addrspace(7) %ptr, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @raw_atomicrmw_add(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; Raw-pointer case: monotonic atomicrmw add of i64 1; the expected output is a single llvm.amdgcn.raw.buffer.atomic.add.i64 call on %desc.
+; CHECK-LABEL: @raw_atomicrmw_add(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %add = atomicrmw add ptr addrspace(7) %ptr, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @raw_atomicrmw_sub(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; Raw-pointer case: monotonic atomicrmw sub of i64 1; the expected output is a single llvm.amdgcn.raw.buffer.atomic.sub.i64 call on %desc.
+; CHECK-LABEL: @raw_atomicrmw_sub(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.sub.i64(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %sub = atomicrmw sub ptr addrspace(7) %ptr, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @raw_atomicrmw_and(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; Raw-pointer case: monotonic atomicrmw and of i64 1; the expected output is a single llvm.amdgcn.raw.buffer.atomic.and.i64 call on %desc.
+; CHECK-LABEL: @raw_atomicrmw_and(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.and.i64(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %and = atomicrmw and ptr addrspace(7) %ptr, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @raw_atomicrmw_or(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; Raw-pointer case: monotonic atomicrmw or of i64 1; the expected output is a single llvm.amdgcn.raw.buffer.atomic.or.i64 call on %desc.
+; CHECK-LABEL: @raw_atomicrmw_or(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.or.i64(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %or = atomicrmw or ptr addrspace(7) %ptr, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @raw_atomicrmw_xor(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; Raw-pointer case: monotonic atomicrmw xor of i64 1; the expected output is a single llvm.amdgcn.raw.buffer.atomic.xor.i64 call on %desc.
+; CHECK-LABEL: @raw_atomicrmw_xor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.xor.i64(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %xor = atomicrmw xor ptr addrspace(7) %ptr, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @raw_atomicrmw_smax(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; Raw-pointer case: monotonic atomicrmw max (the signed form, as opposed to umax) of i64 1; the expected output is a single llvm.amdgcn.raw.buffer.atomic.smax.i64 call on %desc.
+; CHECK-LABEL: @raw_atomicrmw_smax(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.smax.i64(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %smax = atomicrmw max ptr addrspace(7) %ptr, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @raw_atomicrmw_smin(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; Raw-pointer case: monotonic atomicrmw min (the signed form, as opposed to umin) of i64 1; the expected output is a single llvm.amdgcn.raw.buffer.atomic.smin.i64 call on %desc.
+; CHECK-LABEL: @raw_atomicrmw_smin(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.smin.i64(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %smin = atomicrmw min ptr addrspace(7) %ptr, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @raw_atomicrmw_umax(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; Raw-pointer case: monotonic atomicrmw umax of i64 1; the expected output is a single llvm.amdgcn.raw.buffer.atomic.umax.i64 call on %desc.
+; CHECK-LABEL: @raw_atomicrmw_umax(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.umax.i64(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %umax = atomicrmw umax ptr addrspace(7) %ptr, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @raw_atomicrmw_umin(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; Raw-pointer case: monotonic atomicrmw umin of i64 1; the expected output is a single llvm.amdgcn.raw.buffer.atomic.umin.i64 call on %desc.
+; CHECK-LABEL: @raw_atomicrmw_umin(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.buffer.atomic.umin.i64(i64 1, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %umin = atomicrmw umin ptr addrspace(7) %ptr, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @raw_atomicrmw_fadd(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; Raw-pointer case: monotonic atomicrmw fadd of float 1.0; the expected output is a single llvm.amdgcn.raw.buffer.atomic.fadd.f32 call on %desc.
+; CHECK-LABEL: @raw_atomicrmw_fadd(
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float 1.000000e+00, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %fadd = atomicrmw fadd ptr addrspace(7) %ptr, float 1.0 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @raw_atomicrmw_fmax(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; Raw-pointer case: monotonic atomicrmw fmax of float 1.0; the expected output is a single llvm.amdgcn.raw.buffer.atomic.fmax.f32 call on %desc.
+; CHECK-LABEL: @raw_atomicrmw_fmax(
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float 1.000000e+00, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %fmax = atomicrmw fmax ptr addrspace(7) %ptr, float 1.0 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @raw_atomicrmw_fmin(<4 x i32> inreg %desc) !lgc.shaderstage !0 { ; Raw-pointer case: monotonic atomicrmw fmin of float 1.0; the expected output is a single llvm.amdgcn.raw.buffer.atomic.fmin.f32 call on %desc.
+; CHECK-LABEL: @raw_atomicrmw_fmin(
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float 1.000000e+00, <4 x i32> [[DESC:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %fmin = atomicrmw fmin ptr addrspace(7) %ptr, float 1.0 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @struct_atomic_load(<4 x i32> inreg %desc, i32 %index) !lgc.shaderstage !0 { ; Strided-pointer case: monotonic atomic i32 load via lgc.convert.to.strided.buffer.pointer (operand 8) and lgc.strided.index.add; the expected output rewrites descriptor elements 1-3, then calls llvm.amdgcn.struct.atomic.buffer.load.i32 with %index.
+; CHECK-LABEL: @struct_atomic_load(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[DESC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073676289
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], 524288
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[DESC]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -805306369
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], 268435456
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.struct.atomic.buffer.load.i32(<4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 5)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8)
+  %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index)
+  %load = load atomic i32, ptr addrspace(9) %struct.ptr.idx monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @struct_atomicrmw_xchg(<4 x i32> inreg %desc, i32 %index) !lgc.shaderstage !0 { ; Strided-pointer case: monotonic atomicrmw xchg of i64 1 via lgc.convert.to.strided.buffer.pointer (operand 8) and lgc.strided.index.add; the expected output rewrites descriptor elements 1-3, then calls llvm.amdgcn.struct.buffer.atomic.swap.i64 with %index.
+; CHECK-LABEL: @struct_atomicrmw_xchg(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[DESC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073676289
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], 524288
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[DESC]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -805306369
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], 268435456
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.swap.i64(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8)
+  %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index)
+  %xchg = atomicrmw xchg ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @struct_atomicrmw_add(<4 x i32> inreg %desc, i32 %index) !lgc.shaderstage !0 { ; Strided-pointer case: monotonic atomicrmw add of i64 1 via lgc.convert.to.strided.buffer.pointer (operand 8) and lgc.strided.index.add; the expected output rewrites descriptor elements 1-3, then calls llvm.amdgcn.struct.buffer.atomic.add.i64 with %index.
+; CHECK-LABEL: @struct_atomicrmw_add(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[DESC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073676289
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], 524288
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[DESC]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -805306369
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], 268435456
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8)
+  %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index)
+  %add = atomicrmw add ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @struct_atomicrmw_sub(<4 x i32> inreg %desc, i32 %index) !lgc.shaderstage !0 { ; Strided-pointer case: monotonic atomicrmw sub of i64 1 via lgc.convert.to.strided.buffer.pointer (operand 8) and lgc.strided.index.add; the expected output rewrites descriptor elements 1-3, then calls llvm.amdgcn.struct.buffer.atomic.sub.i64 with %index.
+; CHECK-LABEL: @struct_atomicrmw_sub(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[DESC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073676289
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], 524288
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[DESC]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -805306369
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], 268435456
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.sub.i64(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8)
+  %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index)
+  %sub = atomicrmw sub ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @struct_atomicrmw_and(<4 x i32> inreg %desc, i32 %index) !lgc.shaderstage !0 { ; Strided-pointer case: monotonic atomicrmw and of i64 1 via lgc.convert.to.strided.buffer.pointer (operand 8) and lgc.strided.index.add; the expected output rewrites descriptor elements 1-3, then calls llvm.amdgcn.struct.buffer.atomic.and.i64 with %index.
+; CHECK-LABEL: @struct_atomicrmw_and(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[DESC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073676289
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], 524288
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[DESC]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -805306369
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], 268435456
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.and.i64(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8)
+  %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index)
+  %and = atomicrmw and ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @struct_atomicrmw_or(<4 x i32> inreg %desc, i32 %index) !lgc.shaderstage !0 { ; Strided-pointer case: monotonic atomicrmw or of i64 1 via lgc.convert.to.strided.buffer.pointer (operand 8) and lgc.strided.index.add; the expected output rewrites descriptor elements 1-3, then calls llvm.amdgcn.struct.buffer.atomic.or.i64 with %index.
+; CHECK-LABEL: @struct_atomicrmw_or(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[DESC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073676289
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], 524288
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[DESC]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -805306369
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], 268435456
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.or.i64(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8)
+  %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index)
+  %or = atomicrmw or ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @struct_atomicrmw_xor(<4 x i32> inreg %desc, i32 %index) !lgc.shaderstage !0 { ; Strided-pointer case: monotonic atomicrmw xor of i64 1 via lgc.convert.to.strided.buffer.pointer (operand 8) and lgc.strided.index.add; the expected output rewrites descriptor elements 1-3, then calls llvm.amdgcn.struct.buffer.atomic.xor.i64 with %index.
+; CHECK-LABEL: @struct_atomicrmw_xor(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[DESC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073676289
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], 524288
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[DESC]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -805306369
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], 268435456
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.xor.i64(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8)
+  %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index)
+  %xor = atomicrmw xor ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @struct_atomicrmw_smax(<4 x i32> inreg %desc, i32 %index) !lgc.shaderstage !0 { ; Strided-pointer case: monotonic atomicrmw max (the signed form, as opposed to umax) of i64 1 via lgc.convert.to.strided.buffer.pointer (operand 8) and lgc.strided.index.add; the expected output rewrites descriptor elements 1-3, then calls llvm.amdgcn.struct.buffer.atomic.smax.i64 with %index.
+; CHECK-LABEL: @struct_atomicrmw_smax(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[DESC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073676289
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], 524288
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[DESC]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -805306369
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], 268435456
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.smax.i64(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8)
+  %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index)
+  %smax = atomicrmw max ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @struct_atomicrmw_smin(<4 x i32> inreg %desc, i32 %index) !lgc.shaderstage !0 { ; Strided-pointer case: monotonic atomicrmw min (the signed form, as opposed to umin) of i64 1 via lgc.convert.to.strided.buffer.pointer (operand 8) and lgc.strided.index.add; the expected output rewrites descriptor elements 1-3, then calls llvm.amdgcn.struct.buffer.atomic.smin.i64 with %index.
+; CHECK-LABEL: @struct_atomicrmw_smin(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[DESC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073676289
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], 524288
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[DESC]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -805306369
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], 268435456
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.smin.i64(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8)
+  %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index)
+  %smin = atomicrmw min ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @struct_atomicrmw_umax(<4 x i32> inreg %desc, i32 %index) !lgc.shaderstage !0 { ; Strided-pointer case: monotonic atomicrmw umax of i64 1 via lgc.convert.to.strided.buffer.pointer (operand 8) and lgc.strided.index.add; the expected output rewrites descriptor elements 1-3, then calls llvm.amdgcn.struct.buffer.atomic.umax.i64 with %index.
+; CHECK-LABEL: @struct_atomicrmw_umax(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[DESC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073676289
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], 524288
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[DESC]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -805306369
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], 268435456
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.umax.i64(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8)
+  %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index)
+  %umax = atomicrmw umax ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @struct_atomicrmw_umin(<4 x i32> inreg %desc, i32 %index) !lgc.shaderstage !0 { ; Strided-pointer case: monotonic atomicrmw umin of i64 1 via lgc.convert.to.strided.buffer.pointer (operand 8) and lgc.strided.index.add; the expected output rewrites descriptor elements 1-3, then calls llvm.amdgcn.struct.buffer.atomic.umin.i64 with %index.
+; CHECK-LABEL: @struct_atomicrmw_umin(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[DESC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073676289
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], 524288
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[DESC]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -805306369
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], 268435456
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.amdgcn.struct.buffer.atomic.umin.i64(i64 1, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8)
+  %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index)
+  %umin = atomicrmw umin ptr addrspace(9) %struct.ptr.idx, i64 1 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @struct_atomicrmw_fadd(<4 x i32> inreg %desc, i32 %index) !lgc.shaderstage !0 { ; Strided-pointer case: monotonic atomicrmw fadd of float 1.0 via lgc.convert.to.strided.buffer.pointer (operand 8) and lgc.strided.index.add; the expected output rewrites descriptor elements 1-3, then calls llvm.amdgcn.struct.buffer.atomic.fadd.f32 with %index.
+; CHECK-LABEL: @struct_atomicrmw_fadd(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[DESC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073676289
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], 524288
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[DESC]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -805306369
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], 268435456
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float 1.000000e+00, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8)
+  %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index)
+  %fadd = atomicrmw fadd ptr addrspace(9) %struct.ptr.idx, float 1.0 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @struct_atomicrmw_fmax(<4 x i32> inreg %desc, i32 %index) !lgc.shaderstage !0 { ; Strided-pointer case: monotonic atomicrmw fmax of float 1.0 via lgc.convert.to.strided.buffer.pointer (operand 8) and lgc.strided.index.add; the expected output rewrites descriptor elements 1-3, then calls llvm.amdgcn.struct.buffer.atomic.fmax.f32 with %index.
+; CHECK-LABEL: @struct_atomicrmw_fmax(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[DESC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073676289
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], 524288
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[DESC]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -805306369
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], 268435456
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = call float @llvm.amdgcn.struct.buffer.atomic.fmax.f32(float 1.000000e+00, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8)
+  %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index)
+  %fmax = atomicrmw fmax ptr addrspace(9) %struct.ptr.idx, float 1.0 monotonic, align 8
+  ret void
+}
+
+define amdgpu_gfx void @struct_atomicrmw_fmin(<4 x i32> inreg %desc, i32 %index) !lgc.shaderstage !0 { ; Strided-pointer case: monotonic atomicrmw fmin of float 1.0 via lgc.convert.to.strided.buffer.pointer (operand 8) and lgc.strided.index.add; the expected output rewrites descriptor elements 1-3, then calls llvm.amdgcn.struct.buffer.atomic.fmin.f32 with %index.
+; CHECK-LABEL: @struct_atomicrmw_fmin(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[DESC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073676289
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], 524288
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[DESC]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -805306369
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], 268435456
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = call float @llvm.amdgcn.struct.buffer.atomic.fmin.f32(float 1.000000e+00, <4 x i32> [[TMP11]], i32 [[INDEX:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc)
+  %struct.ptr = call ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7) %ptr, i32 8)
+  %struct.ptr.idx = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %struct.ptr, i32 %index)
+  %fmin = atomicrmw fmin ptr addrspace(9) %struct.ptr.idx, float 1.0 monotonic, align 8
+  ret void
+}
+
+declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) nounwind readnone
+declare ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7), i32, i32)
+declare ptr addrspace(9) @lgc.convert.to.strided.buffer.pointer(ptr addrspace(7), i32)
+declare ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9), i32)
+
+!0 = !{i32 7}
diff --git a/lgc/test/Transforms/ReadFirstLane/simple.lgc b/lgc/test/Transforms/ReadFirstLane/simple.lgc
index a1669d741a..5312b0e0c5 100644
--- a/lgc/test/Transforms/ReadFirstLane/simple.lgc
+++ b/lgc/test/Transforms/ReadFirstLane/simple.lgc
@@ -4,7 +4,7 @@
 define i32 @simple(i32 %x) {
 ; CHECK-LABEL: @simple(
 ; CHECK-NEXT:    [[Y:%.*]] = mul i32 [[X:%.*]], 2
-; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.amdgcn.readfirstlane{{(.i32)?}}(i32 [[Y]])
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[Y]])
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %y = mul i32 %x, 2
@@ -15,9 +15,9 @@ define i32 @simple(i32 %x) {
 define i32 @simple2(i32 %x) {
 ; CHECK-LABEL: @simple2(
 ; CHECK-NEXT:    [[A:%.*]] = mul i32 [[X:%.*]], 5
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane{{(.i32)?}}(i32 [[A]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[A]])
 ; CHECK-NEXT:    [[B:%.*]] = add i32 [[TMP1]], 2
-; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.amdgcn.readfirstlane{{(.i32)?}}(i32 [[B]])
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[B]])
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %a = mul i32 %x, 5
@@ -30,10 +30,11 @@ define <2 x i32> @vec2(ptr addrspace(4) %ptr) {
 ; CHECK-LABEL: @vec2(
 ; CHECK-NEXT:    [[PTR_OFS:%.*]] = getelementptr <2 x i32>, ptr addrspace(4) [[PTR:%.*]], i32 2
 ; CHECK-NEXT:    [[DESC:%.*]] = load <2 x i32>, ptr addrspace(4) [[PTR_OFS]], align 16
-; CHECK-NEXT:    [[DESC_0:%.*]] = extractelement <2 x i32> [[DESC]], i32 0
-; CHECK-NEXT:    [[DESC_1:%.*]] = extractelement <2 x i32> [[DESC]], i32 1
-; CHECK-NEXT:    [[RF_0:%.*]] = call i32 @llvm.amdgcn.readfirstlane{{(.i32)?}}(i32 [[DESC_0]])
-; CHECK-NEXT:    [[RF_1:%.*]] = call i32 @llvm.amdgcn.readfirstlane{{(.i32)?}}(i32 [[DESC_1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> [[DESC]])
+; CHECK-NEXT:    [[DESC_0:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[DESC_1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[RF_0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[DESC_0]])
+; CHECK-NEXT:    [[RF_1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[DESC_1]])
 ; CHECK-NEXT:    [[OUT_0:%.*]] = insertelement <2 x i32> poison, i32 [[RF_0]], i32 0
 ; CHECK-NEXT:    [[OUT_1:%.*]] = insertelement <2 x i32> [[OUT_0]], i32 [[RF_1]], i32 1
 ; CHECK-NEXT:    ret <2 x i32> [[OUT_1]]
@@ -49,4 +50,26 @@ define <2 x i32> @vec2(ptr addrspace(4) %ptr) {
   ret <2 x i32> %out.1
 }
 
-declare i32 @llvm.amdgcn.readfirstlane(i32)
+define i32 @bool(i32 %off) {
+; CHECK-LABEL: @bool(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[OFF:%.*]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.readfirstlane.i1(i1 [[CMP]])
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[TMP1]], i64 32, i64 0
+; CHECK-NEXT:    [[PTR_OFS:%.*]] = getelementptr i32, ptr addrspace(4) null, i64 [[SEL]]
+; CHECK-NEXT:    [[DESC:%.*]] = load i32, ptr addrspace(4) [[PTR_OFS]], align 16
+; CHECK-NEXT:    [[RF:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[DESC]])
+; CHECK-NEXT:    ret i32 [[RF]]
+; Input IR: the readfirstlane operand is a load whose address is selected by an i1 compare; the assertions above expect an additional readfirstlane.i1 on the compare result, feeding the select, GEP and load.
+  %cmp = icmp ugt i32 %off, 1
+  %sel = select i1 %cmp, i64 32, i64 0
+  %ptr.ofs = getelementptr i32, ptr addrspace(4) null, i64 %sel
+  %desc = load i32, ptr addrspace(4) %ptr.ofs, align 16
+  %rf = call i32 @llvm.amdgcn.readfirstlane(i32 %desc)  ; unsuffixed intrinsic name here; the expected output above uses the mangled .i32 form
+  ret i32 %rf
+}
+
+; Function Attrs: convergent nocallback nofree nounwind willreturn memory(none)
+declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
+
+; Function Attrs: convergent nocallback nofree nounwind willreturn memory(none)
+declare <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32>)
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc
index 4ffc869bff..f696a34067 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc
@@ -31,7 +31,7 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP28]], align 4, !invariant.load [[META16]]
-; CHECK-NEXT:    [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 0, <8 x i32> [[TMP13]], i32 0, i32 0)
+; CHECK-NEXT:    [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32{{(\.v8i32)?}}(i32 15, i32 0, <8 x i32> [[TMP13]], i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]])
 ; CHECK-NEXT:    [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP24]], align 4, !invariant.load [[META16]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP1]])
@@ -40,7 +40,7 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP18]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP26]], align 4, !invariant.load [[META16]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP16]], <8 x i32> [[TMP20]])
-; CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[TMP15]], i32 15, i32 1, <8 x i32> [[TMP21]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32{{(\.v8i32)?}}(<4 x float> [[TMP15]], i32 15, i32 1, <8 x i32> [[TMP21]], i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
 .entry:
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest10.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest10.lgc
index acd93c3ad6..9b1a7c31cd 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest10.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest10.lgc
@@ -56,7 +56,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP13]] to i64
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 4, !invariant.load [[META10]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    [[I11:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]])
 ; CHECK-NEXT:    call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[I11]]) #[[ATTR6:[0-9]+]]
 ; CHECK-NEXT:    [[IND]] = add i32 [[PHI_IND]], 1
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest11.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest11.lgc
index e6fc0b8e0a..d52d45edb8 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest11.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest11.lgc
@@ -52,7 +52,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP12]], <4 x i32> [[I9]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP13]], <8 x i32> [[I10]])
 ; CHECK-NEXT:    [[TMP15:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP13]], <4 x i32> [[I9]])
-; CHECK-NEXT:    [[TMP16:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP14]], <4 x i32> [[TMP15]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP16:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP14]], <4 x i32> [[TMP15]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    [[I11:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP13]], <4 x float> [[TMP16]])
 ; CHECK-NEXT:    call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[I11]]) #[[ATTR6:[0-9]+]]
 ; CHECK-NEXT:    [[IND]] = add i32 [[PHI_IND]], 1
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest12.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest12.lgc
index 300fa99474..ff849485d3 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest12.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest12.lgc
@@ -50,7 +50,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP13]] to i64
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 4, !invariant.load [[META10]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    [[I11]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]])
 ; CHECK-NEXT:    [[I12:%.*]] = fadd <4 x float> [[PHI_IMG]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[IND]] = add i32 [[PHI_IND]], 1
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest13.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest13.lgc
index 40cd05470a..1ad1e8cf4c 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest13.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest13.lgc
@@ -49,7 +49,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP17]], align 4, !invariant.load [[META10]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP14]], <4 x i32> [[TMP18]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP19:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP14]], <4 x i32> [[TMP18]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    [[I11:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP13]], <4 x float> [[TMP19]])
 ; CHECK-NEXT:    call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[I11]]) #[[ATTR6:[0-9]+]]
 ; CHECK-NEXT:    [[IND]] = add i32 [[PHI_IND]], 1
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest14.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest14.lgc
index 77614a9a16..b86e284665 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest14.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest14.lgc
@@ -51,7 +51,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP13]] to i64
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 4, !invariant.load [[META10]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    [[I11]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]])
 ; CHECK-NEXT:    [[I12:%.*]] = fadd <4 x float> [[PHI_IMG]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[IND]] = add i32 [[PHI_IND]], 1
@@ -65,7 +65,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP23]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP24]], align 4, !invariant.load [[META10]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP21]], <8 x i32> [[TMP28]])
-; CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[I12]], i32 15, i32 1, <8 x i32> [[TMP29]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32{{(\.v8i32)?}}(<4 x float> [[I12]], i32 15, i32 1, <8 x i32> [[TMP29]], i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
 .entry:
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest15.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest15.lgc
index 536bfbdf55..f45f4e1820 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest15.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest15.lgc
@@ -50,7 +50,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP13]] to i64
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 4, !invariant.load [[META10]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    [[I11]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]])
 ; CHECK-NEXT:    [[I12:%.*]] = fadd <4 x float> [[PHI_IMG]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i32>, ptr addrspace(4) [[I8]], align 4, !invariant.load [[META10]]
@@ -60,7 +60,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP23]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP24]], align 4, !invariant.load [[META10]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP21]], <8 x i32> [[TMP28]])
-; CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[I12]], i32 15, i32 1, <8 x i32> [[TMP29]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32{{(\.v8i32)?}}(<4 x float> [[I12]], i32 15, i32 1, <8 x i32> [[TMP29]], i32 0, i32 0)
 ; CHECK-NEXT:    [[IND]] = add i32 [[PHI_IND]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp ne i32 [[IND]], 1000
 ; CHECK-NEXT:    br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]]
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest16.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest16.lgc
index 959f5fe58f..aee46c2951 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest16.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest16.lgc
@@ -48,7 +48,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP16]], align 4, !invariant.load [[META10]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP13]], <8 x i32> [[TMP18]])
-; CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[TMP12]], i32 15, i32 1, <8 x i32> [[TMP19]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32{{(\.v8i32)?}}(<4 x float> [[TMP12]], i32 15, i32 1, <8 x i32> [[TMP19]], i32 0, i32 0)
 ; CHECK-NEXT:    [[IND]] = add i32 [[PHI_IND]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp ne i32 [[IND]], 1000
 ; CHECK-NEXT:    br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]]
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc
index 1a89e812f6..db32092deb 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc
@@ -32,7 +32,7 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP19]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP18]], align 4, !invariant.load [[META16]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP8]], <8 x i32> [[TMP12]])
-; CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[TMP7]], i32 15, i32 1, <8 x i32> [[TMP13]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32{{(\.v8i32)?}}(<4 x float> [[TMP7]], i32 15, i32 1, <8 x i32> [[TMP13]], i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
 .entry:
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc
index 176dca5ce4..26cc80c901 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc
@@ -32,7 +32,7 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP19]], align 4, !invariant.load [[META16]]
-; CHECK-NEXT:    [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP13]], <4 x i32> <i32 42, i32 42, i32 42, i32 42>, i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP13]], <4 x i32> <i32 42, i32 42, i32 42, i32 42>, i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc
index 1d568c02d8..269dc0163d 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc
@@ -38,7 +38,7 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP22]], align 4, !invariant.load [[META16]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP11]], <4 x i32> [[TMP9]])
-; CHECK-NEXT:    [[TMP17:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP15]], <4 x i32> [[TMP16]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP17:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP15]], <4 x i32> [[TMP16]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP18:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP11]], <4 x float> [[TMP17]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc
index 81b2717e12..3e6944dcd9 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc
@@ -34,7 +34,7 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP28]], align 4, !invariant.load [[META16]]
-; CHECK-NEXT:    [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 0, <8 x i32> [[TMP13]], i32 0, i32 0)
+; CHECK-NEXT:    [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32{{(\.v8i32)?}}(i32 15, i32 0, <8 x i32> [[TMP13]], i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]])
 ; CHECK-NEXT:    [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP24]], align 4, !invariant.load [[META16]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP1]])
@@ -43,7 +43,7 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP18]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP26]], align 4, !invariant.load [[META16]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP16]], <8 x i32> [[TMP20]])
-; CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[TMP15]], i32 15, i32 1, <8 x i32> [[TMP21]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32{{(\.v8i32)?}}(<4 x float> [[TMP15]], i32 15, i32 1, <8 x i32> [[TMP21]], i32 0, i32 0)
 ; CHECK-NEXT:    br label [[RET]]
 ; CHECK:       ret:
 ; CHECK-NEXT:    ret void
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest6.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest6.lgc
index c934bb95e1..9c6d5c8f0c 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest6.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest6.lgc
@@ -37,7 +37,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP17]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP17]], i32 -1) ]
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP17]], i32 0
 ; CHECK-NEXT:    [[TMP19:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP18]], align 4, !invariant.load [[META24:![0-9]+]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call <4 x i32> @llvm.amdgcn.image.load.1d.v4i32.i32(i32 15, i32 [[DOT0]], <8 x i32> [[TMP19]], i32 0, i32 0), !invariant.load [[META24]]
+; CHECK-NEXT:    [[TMP20:%.*]] = call <4 x i32> @llvm.amdgcn.image.load.1d.v4i32.i32{{(\.v8i32)?}}(i32 15, i32 [[DOT0]], <8 x i32> [[TMP19]], i32 0, i32 0), !invariant.load [[META24]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP20]], i64 0
 ; CHECK-NEXT:    [[TMP22:%.*]] = call i32 @lgc.load.user.data__i32(i32 36)
 ; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP22]], i64 0
@@ -61,7 +61,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP39:%.*]] = sext i32 [[TMP38]] to i64
 ; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP26]], i64 [[TMP39]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP40]], align 4, !invariant.load [[META24]]
-; CHECK-NEXT:    [[TMP42:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP41]], <4 x i32> [[TMP36]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP42:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP41]], <4 x i32> [[TMP36]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP43:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP37]], <4 x float> [[TMP42]])
 ; CHECK-NEXT:    [[TMP44:%.*]] = mul i32 [[TMP7]], 32
 ; CHECK-NEXT:    [[TMP45:%.*]] = sext i32 [[TMP44]] to i64
@@ -73,7 +73,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP50:%.*]] = sext i32 [[TMP49]] to i64
 ; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP26]], i64 [[TMP50]]
 ; CHECK-NEXT:    [[TMP67:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP51]], align 4, !invariant.load [[META24]]
-; CHECK-NEXT:    [[TMP53:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP67]], <4 x i32> [[TMP59]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP53:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP67]], <4 x i32> [[TMP59]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP54:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP48]], <4 x float> [[TMP53]])
 ; CHECK-NEXT:    [[TMP68:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP29]], align 4, !invariant.load [[META24]]
 ; CHECK-NEXT:    [[TMP69:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP35]], align 4, !invariant.load [[META24]]
@@ -82,7 +82,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP57:%.*]] = sext i32 [[TMP56]] to i64
 ; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP26]], i64 [[TMP57]]
 ; CHECK-NEXT:    [[TMP70:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP58]], align 4, !invariant.load [[META24]]
-; CHECK-NEXT:    [[TMP60:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP70]], <4 x i32> [[TMP69]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP60:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP70]], <4 x i32> [[TMP69]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP61:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP55]], <4 x float> [[TMP60]])
 ; CHECK-NEXT:    [[TMP62]] = fadd reassoc nnan nsz arcp contract afn <4 x float> [[DOT09]], [[TMP61]]
 ; CHECK-NEXT:    [[TMP63:%.*]] = fadd reassoc nnan nsz arcp contract afn <4 x float> [[TMP43]], [[TMP54]]
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest7.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest7.lgc
index 37f97ccd90..1da3d84680 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest7.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest7.lgc
@@ -48,7 +48,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP13]] to i64
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP18]], align 4, !invariant.load [[META10]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    [[I13:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]])
 ; CHECK-NEXT:    call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[I13]]) #[[ATTR6:[0-9]+]]
 ; CHECK-NEXT:    ret void
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest8.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest8.lgc
index cd823cdc5b..fd6abb50e6 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest8.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest8.lgc
@@ -53,7 +53,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP13]] to i64
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 4, !invariant.load [[META10]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    [[I11:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]])
 ; CHECK-NEXT:    call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[I11]]) #[[ATTR6:[0-9]+]]
 ; CHECK-NEXT:    ret void
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest9.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest9.lgc
index dbe9969ddc..1b616ebf60 100644
--- a/lgc/test/scalarizationOfDescriptorLoadsTest9.lgc
+++ b/lgc/test/scalarizationOfDescriptorLoadsTest9.lgc
@@ -49,7 +49,7 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi
 ; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP13]] to i64
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 4, !invariant.load [[META10]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT:    [[I11:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]])
 ; CHECK-NEXT:    call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[I11]]) #[[ATTR6:[0-9]+]]
 ; CHECK-NEXT:    [[IND]] = add i32 [[PHI_IND]], 1
diff --git a/lgc/unittests/CMakeLists.txt b/lgc/unittests/CMakeLists.txt
index ded3d3240b..603b5e99e1 100644
--- a/lgc/unittests/CMakeLists.txt
+++ b/lgc/unittests/CMakeLists.txt
@@ -35,6 +35,7 @@ function(add_lgc_unittest test_dirname)
 endfunction()
 
 add_subdirectory(interface)
+add_subdirectory(internal)
 
 # Add a LIT target to execute all unit tests.
 # Required by lit.site.cfg.py.in.
@@ -54,4 +55,5 @@ add_lit_testsuite(check-lgc-units "Running the LGC unit tests"
   ${exclude_from_check_all}
   DEPENDS
     LgcUnitTests
+    LgcInternalTests
 )
diff --git a/lgc/unittests/internal/CMakeLists.txt b/lgc/unittests/internal/CMakeLists.txt
new file mode 100644
index 0000000000..49bb249584
--- /dev/null
+++ b/lgc/unittests/internal/CMakeLists.txt
@@ -0,0 +1,37 @@
+##
+ #######################################################################################################################
+ #
+ #  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ #
+ #  Permission is hereby granted, free of charge, to any person obtaining a copy
+ #  of this software and associated documentation files (the "Software"), to
+ #  deal in the Software without restriction, including without limitation the
+ #  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ #  sell copies of the Software, and to permit persons to whom the Software is
+ #  furnished to do so, subject to the following conditions:
+ #
+ #  The above copyright notice and this permission notice shall be included in all
+ #  copies or substantial portions of the Software.
+ #
+ #  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ #  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ #  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ #  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ #  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ #  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ #  IN THE SOFTWARE.
+ #
+ #######################################################################################################################
+
+add_lgc_unittest(LgcInternalTests
+  MsgPackScannerTest.cpp
+)
+
+target_link_libraries(LgcInternalTests PRIVATE
+  LLVMCore
+  LLVMlgc
+)
+
+target_include_directories(LgcInternalTests PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+)
diff --git a/lgc/unittests/internal/MsgPackScannerTest.cpp b/lgc/unittests/internal/MsgPackScannerTest.cpp
new file mode 100644
index 0000000000..3496b3bf0a
--- /dev/null
+++ b/lgc/unittests/internal/MsgPackScannerTest.cpp
@@ -0,0 +1,272 @@
+/*
+ ***********************************************************************************************************************
+ *
+ *  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal in the Software without restriction, including without limitation the
+ *  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ *  sell copies of the Software, and to permit persons to whom the Software is
+ *  furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice shall be included in all
+ *  copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ *
+ **********************************************************************************************************************/
+
+#include "lgc/util/MsgPackScanner.h"
+#include "llvm/BinaryFormat/MsgPackDocument.h"
+#include "gmock/gmock.h"
+
+using namespace lgc;
+using namespace llvm;
+
+TEST(MsgPackScanner, TestReadUpdateInt) {
+  static const struct { MsgPackScanner::Item top = {MsgPackScanner::ItemType::Scalar}; } spec;
+  MsgPackScanner::Spec scannerSpec(&spec);
+  MsgPackScanner scanner(scannerSpec);
+  StringRef blob = StringRef("\xd0\x2a", 2); // MsgPack "int 8" (0xd0) holding 0x2a
+  Error err = scanner.scan(blob);
+  ASSERT_FALSE(err);
+  ASSERT_EQ(scanner.asInt(spec.top), 0x2a);
+  // Update the top item to a value that no longer fits in a single byte.
+  scanner.set(spec.top, 0x12a);
+  ASSERT_EQ(scanner.asInt(spec.top), 0x12a);
+  // Write the updated MsgPack.
+  SmallString<0> output;
+  raw_svector_ostream stream(output);
+  scanner.write(stream);
+  ASSERT_EQ(output, "\xcd\x01\x2a"); // MsgPack "uint 16" (0xcd) holding big-endian 0x012a
+}
+
+TEST(MsgPackScanner, TestReadBinary) {
+  static const struct { MsgPackScanner::Item top = {MsgPackScanner::ItemType::Scalar}; } spec;
+  MsgPackScanner::Spec scannerSpec(&spec);
+  MsgPackScanner scanner(scannerSpec);
+  StringRef blob = StringRef("\xC4\x4\x1\x2\x3\x4", 6); // MsgPack "bin 8" (0xc4), length 4, then 4 payload bytes
+  Error err = scanner.scan(blob);
+  ASSERT_FALSE(err);
+  ASSERT_EQ(scanner.asString(spec.top), "\x1\x2\x3\x4"); // Expect only the payload, without the 2-byte header
+}
+
+TEST(MsgPackScanner, TestReadUpdateArray) {
+  // clang-format off
+  static const struct {
+    MsgPackScanner::Item top = {MsgPackScanner::ItemType::Array};
+    MsgPackScanner::Item   element0 = {MsgPackScanner::ItemType::Scalar};
+    MsgPackScanner::Item   element1 = {MsgPackScanner::ItemType::Scalar};
+    MsgPackScanner::Item   element2 = {MsgPackScanner::ItemType::Scalar};
+    MsgPackScanner::Item end = {MsgPackScanner::ItemType::EndContainer};
+  } spec;
+  // clang-format on
+  MsgPackScanner::Spec scannerSpec(&spec);
+  MsgPackScanner scanner(scannerSpec);
+  StringRef blob = StringRef("\x92\x2b\x2c"); // fixarray of 2 elements: positive fixints 0x2b and 0x2c
+  Error err = scanner.scan(blob);
+  ASSERT_FALSE(err);
+  ASSERT_EQ(scanner.asInt(spec.element0), 0x2b);
+  ASSERT_EQ(scanner.asInt(spec.element1), 0x2c);
+  ASSERT_FALSE(scanner.isSet(spec.element2));
+  // Update element 0 to a value that needs a wider encoding than the original single byte.
+  scanner.set(spec.element0, 0x12b);
+  ASSERT_EQ(scanner.asInt(spec.element0), 0x12b);
+  // Update element 2. This was not present before, so it extends the array.
+  scanner.set(spec.element2, 0x22b);
+  ASSERT_EQ(scanner.asInt(spec.element2), 0x22b);
+  // Write the updated MsgPack.
+  SmallString<0> output;
+  raw_svector_ostream stream(output);
+  scanner.write(stream);
+  ASSERT_EQ(output, "\x93\xcd\x01\x2b\x2c\xcd\x02\x2b"); // fixarray of 3: uint16 0x12b, fixint 0x2c, uint16 0x22b
+}
+
+TEST(MsgPackScanner, TestReadUpdateMap) {
+  // clang-format off
+  static const struct {
+    MsgPackScanner::Item top = {MsgPackScanner::ItemType::Map};
+    MsgPackScanner::Item   bar = {MsgPackScanner::ItemType::Scalar, "bar"};
+    MsgPackScanner::Item   cad = {MsgPackScanner::ItemType::Scalar, "cad"};
+    MsgPackScanner::Item   foo = {MsgPackScanner::ItemType::Scalar, "foo"};
+    MsgPackScanner::Item end = {MsgPackScanner::ItemType::EndContainer};
+  } spec;
+  // clang-format on
+  MsgPackScanner::Spec scannerSpec(&spec);
+  MsgPackScanner scanner(scannerSpec);
+  StringRef blob = StringRef("\x82\xa3" // fixmap of 2 entries; fixstr of length 3 (the key) follows
+                             "foo"      // Separate literal so the preceding hex escape cannot absorb 'f'
+                             "\xd0\x2d\xa3" // int 8 value 0x2d; then fixstr of length 3 (the next key)
+                             "bar"
+                             "\xd0\x2e"); // int 8 value 0x2e
+  Error err = scanner.scan(blob);
+  ASSERT_FALSE(err);
+  ASSERT_EQ(scanner.asInt(spec.bar), 0x2e);
+  ASSERT_EQ(scanner.asInt(spec.foo), 0x2d);
+  ASSERT_FALSE(scanner.isSet(spec.cad));
+  // Update foo to a value that needs a wider encoding than the original int 8.
+  scanner.set(spec.foo, 0x12d);
+  // Set cad. This was not present before, so it extends the map.
+  scanner.set(spec.cad, "wibble");
+  // Write the updated MsgPack. The expected bytes keep the blob's key order (foo, bar) and append cad at the end.
+  SmallString<0> output;
+  raw_svector_ostream stream(output);
+  scanner.write(stream);
+  ASSERT_EQ(output, StringRef("\x83\xa3" // fixmap of 3 entries; fixstr(3) key
+                              "foo"
+                              "\xcd\x01\x2d\xa3" // uint 16 value 0x012d; fixstr(3) key
+                              "bar"
+                              "\xd0\x2e\xa3" // unchanged int 8 value 0x2e; fixstr(3) key
+                              "cad"
+                              "\xa6" // fixstr of length 6
+                              "wibble"));
+}
+
+TEST(MsgPackScanner, TestNestedCreateMapFromEmpty) {
+  // clang-format off
+  static const struct {
+    MsgPackScanner::Item top = {MsgPackScanner::ItemType::Map};
+    MsgPackScanner::Item   bar = {MsgPackScanner::ItemType::Scalar, "bar"};
+    MsgPackScanner::Item   map2 = {MsgPackScanner::ItemType::Map, "map2"};
+    MsgPackScanner::Item     foo = {MsgPackScanner::ItemType::Scalar, "foo"};
+    MsgPackScanner::Item     map3 = {MsgPackScanner::ItemType::Map, "map3"};
+    MsgPackScanner::Item       cat = {MsgPackScanner::ItemType::Scalar, "cat"};
+    MsgPackScanner::Item     endMap3 = {MsgPackScanner::ItemType::EndContainer};
+    MsgPackScanner::Item   endMap2 = {MsgPackScanner::ItemType::EndContainer};
+    MsgPackScanner::Item end = {MsgPackScanner::ItemType::EndContainer};
+  } spec;
+  // clang-format on
+  MsgPackScanner::Spec scannerSpec(&spec);
+  MsgPackScanner scanner(scannerSpec); // No scan() call: the scanner starts with no MsgPack at all
+  // Set "cat", resulting in the creation of all three levels of map.
+  scanner.set(spec.cat, "mouse");
+  ASSERT_EQ(scanner.asString(spec.cat), StringRef("mouse"));
+  ASSERT_FALSE(scanner.isSet(spec.bar));
+  ASSERT_FALSE(scanner.isSet(spec.foo));
+  // Write the updated MsgPack.
+  SmallString<0> output;
+  raw_svector_ostream stream(output);
+  scanner.write(stream);
+  // Check it by parsing with msgpack::Document (failing early if the output is not valid MsgPack) and converting to YAML.
+  msgpack::Document doc;
+  ASSERT_TRUE(doc.readFromBlob(output, /*Multi=*/false));
+  SmallString<0> yaml;
+  raw_svector_ostream yamlStream(yaml);
+  doc.toYAML(yamlStream);
+  ASSERT_EQ(yaml, StringRef("---\n"
+                            "map2:\n"
+                            "  map3:\n"
+                            "    cat:             mouse\n"
+                            "...\n"));
+}
+
+TEST(MsgPackScanner, TestNestedCreateMap) {
+  // clang-format off
+  static const struct {
+    MsgPackScanner::Item top = {MsgPackScanner::ItemType::Map};
+    MsgPackScanner::Item   bar = {MsgPackScanner::ItemType::Scalar, "bar"};
+    MsgPackScanner::Item   map2 = {MsgPackScanner::ItemType::Map, "map2"};
+    MsgPackScanner::Item     foo = {MsgPackScanner::ItemType::Scalar, "foo"};
+    MsgPackScanner::Item     map3 = {MsgPackScanner::ItemType::Map, "map3"};
+    MsgPackScanner::Item       cat = {MsgPackScanner::ItemType::Scalar, "cat"};
+    MsgPackScanner::Item     endMap3 = {MsgPackScanner::ItemType::EndContainer};
+    MsgPackScanner::Item   endMap2 = {MsgPackScanner::ItemType::EndContainer};
+    MsgPackScanner::Item end = {MsgPackScanner::ItemType::EndContainer};
+  } spec;
+  // clang-format on
+  MsgPackScanner::Spec scannerSpec(&spec);
+  // Create initial MsgPack blob using msgpack::Document to parse YAML text.
+  const char yaml[] = "---\n"
+                      "bar: barrow\n"
+                      "...\n";
+  msgpack::Document doc;
+  ASSERT_TRUE(doc.fromYAML(yaml)); // Fail here, not at the final comparison, if the fixture itself is bad
+  std::string blob;
+  doc.writeToBlob(blob);
+  // Scan blob into MsgPackScanner.
+  MsgPackScanner scanner(scannerSpec);
+  Error err = scanner.scan(blob);
+  ASSERT_FALSE(err);
+  ASSERT_EQ(scanner.asString(spec.bar), StringRef("barrow"));
+  // Set "cat", resulting in the creation of map2 and map3.
+  scanner.set(spec.cat, "mouse");
+  ASSERT_EQ(scanner.asString(spec.cat), StringRef("mouse"));
+  ASSERT_FALSE(scanner.isSet(spec.foo));
+  // Change the value of "bar", changing its size.
+  scanner.set(spec.bar, "barycentric");
+  // Write the updated MsgPack.
+  SmallString<0> output;
+  raw_svector_ostream stream(output);
+  scanner.write(stream);
+  // Check it by parsing with msgpack::Document (failing early if the output is not valid MsgPack) and converting to YAML.
+  msgpack::Document doc2;
+  ASSERT_TRUE(doc2.readFromBlob(output, /*Multi=*/false));
+  SmallString<0> yaml2;
+  raw_svector_ostream yamlStream2(yaml2);
+  doc2.toYAML(yamlStream2);
+  ASSERT_EQ(yaml2, StringRef("---\n"
+                             "bar:             barycentric\n"
+                             "map2:\n"
+                             "  map3:\n"
+                             "    cat:             mouse\n"
+                             "...\n"));
+}
+
+TEST(MsgPackScanner, TestReduceSize) {
+  // clang-format off
+  static const struct {
+    MsgPackScanner::Item top = {MsgPackScanner::ItemType::Map};
+    MsgPackScanner::Item   map2 = {MsgPackScanner::ItemType::Map, "map2"};
+    MsgPackScanner::Item     map3 = {MsgPackScanner::ItemType::Map, "map3"};
+    MsgPackScanner::Item       cat = {MsgPackScanner::ItemType::Scalar, "cat"};
+    MsgPackScanner::Item     endMap3 = {MsgPackScanner::ItemType::EndContainer};
+    MsgPackScanner::Item   endMap2 = {MsgPackScanner::ItemType::EndContainer};
+    MsgPackScanner::Item end = {MsgPackScanner::ItemType::EndContainer};
+  } spec;
+  // clang-format on
+  MsgPackScanner::Spec scannerSpec(&spec);
+  // Create initial MsgPack blob using msgpack::Document to parse YAML text; bar and squirrel are not in the spec.
+  const char yaml[] = "---\n"
+                      "bar: barycentric\n"
+                      "map2:\n"
+                      "  map3:\n"
+                      "    cat: mouse\n"
+                      "  squirrel: nut\n"
+                      "...\n";
+  msgpack::Document doc;
+  ASSERT_TRUE(doc.fromYAML(yaml)); // Fail here, not at the final comparison, if the fixture itself is bad
+  std::string blob;
+  doc.writeToBlob(blob);
+  // Scan blob into MsgPackScanner.
+  MsgPackScanner scanner(scannerSpec);
+  Error err = scanner.scan(blob);
+  ASSERT_FALSE(err);
+  ASSERT_EQ(scanner.asString(spec.cat), StringRef("mouse"));
+  // Set "cat" to "ox", a shorter string.
+  scanner.set(spec.cat, "ox");
+  ASSERT_EQ(scanner.asString(spec.cat), StringRef("ox"));
+  // Write the updated MsgPack.
+  SmallString<0> output;
+  raw_svector_ostream stream(output);
+  scanner.write(stream);
+  // Check it by parsing with msgpack::Document and converting to YAML; entries outside the spec must be unchanged.
+  msgpack::Document doc2;
+  ASSERT_TRUE(doc2.readFromBlob(output, /*Multi=*/false));
+  SmallString<0> yaml2;
+  raw_svector_ostream yamlStream2(yaml2);
+  doc2.toYAML(yamlStream2);
+  ASSERT_EQ(yaml2, StringRef("---\n"
+                             "bar:             barycentric\n"
+                             "map2:\n"
+                             "  map3:\n"
+                             "    cat:             ox\n"
+                             "  squirrel:        nut\n"
+                             "...\n"));
+}
diff --git a/lgc/util/Internal.cpp b/lgc/util/Internal.cpp
index e7a94d8bdb..327b1b3e1c 100644
--- a/lgc/util/Internal.cpp
+++ b/lgc/util/Internal.cpp
@@ -28,6 +28,7 @@
  * @brief LLPC source file: contains implementation of LLPC internal-use utility functions.
  ***********************************************************************************************************************
  */
+#include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/raw_os_ostream.h"
@@ -205,6 +206,14 @@ bool canBitCast(const Type *ty1, const Type *ty2) {
   return valid;
 }
 
+// Tells whether amdgcn_readfirstlane in the backend accepts this type: i32 vectors, or scalar float, i32 or i1.
+// @param ty : Type to check
+bool isReadFirstLaneTypeSupported(const llvm::Type *ty) {
+  if (ty->isVectorTy())
+    return cast<VectorType>(ty)->getElementType()->isIntegerTy(32);
+  return ty->isFloatTy() || ty->isIntegerTy(32) || ty->isIntegerTy(1);
+}
+
 // =====================================================================================================================
 // Checks if the specified value actually represents a don't-care value (0xFFFFFFFF).
 //
@@ -238,7 +247,7 @@ Type *getVgprTy(Type *ty) {
 // =====================================================================================================================
 // Helper function to create LLVM Function and update NewDbgInfoFormat flag
 llvm::Function *createFunctionHelper(llvm::FunctionType *ty, llvm::GlobalValue::LinkageTypes linkage,
-                                     llvm::Module *module, const llvm::Twine &name) {
+                                     llvm::Module *module, bool createDbgInfo, const llvm::Twine &name) {
 
   llvm::Function *func = Function::Create(ty, linkage, name);
 
@@ -246,7 +255,31 @@ llvm::Function *createFunctionHelper(llvm::FunctionType *ty, llvm::GlobalValue::
   func->setIsNewDbgInfoFormat(module->IsNewDbgInfoFormat);
 #endif
 
+  if (createDbgInfo) {
+    DIBuilder debugBuilder(*module);
+    DIFile *fileContext = debugBuilder.createFile("internal", "");
+    debugBuilder.createCompileUnit(dwarf::DW_LANG_C99, fileContext, "lgc", false, StringRef(), 0);
+    DISubprogram *funcContext =
+        debugBuilder.createFunction(fileContext, func->getName(), StringRef(), fileContext, 0,
+                                    debugBuilder.createSubroutineType(debugBuilder.getOrCreateTypeArray({})), 0,
+                                    DINode::DIFlags::FlagArtificial, DISubprogram::SPFlagDefinition);
+    func->setSubprogram(funcContext);
+  }
+
   return func;
 }
 
+// =====================================================================================================================
+// Helper function to call LLVM Function and set debug location when both callee and caller carry a DISubprogram
+llvm::CallInst *callFunctionHelper(llvm::Function *func, llvm::ArrayRef<llvm::Value *> args,
+                                   llvm::BasicBlock *insertAtEnd) {
+  BuilderBase builder(insertAtEnd);
+  CallInst *call = builder.CreateCall(func, args);
+  // The location is scoped to the caller's subprogram, which can be null even when the callee has debug info;
+  // dereferencing it unchecked would crash, so no location is attached in that case.
+  DISubprogram *callerSP = func->getSubprogram() ? call->getParent()->getParent()->getSubprogram() : nullptr;
+  if (callerSP)
+    call->setDebugLoc(DILocation::get(callerSP->getContext(), callerSP->getScopeLine(), 0, callerSP));
+  return call;
+}
 } // namespace lgc
diff --git a/lgc/util/MsgPackScanner.cpp b/lgc/util/MsgPackScanner.cpp
new file mode 100644
index 0000000000..02fcb9e05f
--- /dev/null
+++ b/lgc/util/MsgPackScanner.cpp
@@ -0,0 +1,675 @@
+/*
+ ***********************************************************************************************************************
+ *
+ *  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal in the Software without restriction, including without limitation the
+ *  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ *  sell copies of the Software, and to permit persons to whom the Software is
+ *  furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice shall be included in all
+ *  copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ *
+ **********************************************************************************************************************/
+
+// MsgPackScanner class to read, write and incrementally update MsgPack.
+
+#include "lgc/util/MsgPackScanner.h"
+#include "llvm/BinaryFormat/MsgPack.h"
+#include "llvm/BinaryFormat/MsgPackReader.h"
+#include "llvm/BinaryFormat/MsgPackWriter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Endian.h"
+
+#define DEBUG_TYPE "msgpack-scanner"
+
+using namespace lgc;
+using namespace llvm;
+using namespace llvm::msgpack;
+using namespace llvm::support;
+
+// =====================================================================================================================
+// Compute, at run time, the 32-bit FNV-1a hash of the given string.
+static uint32_t fnv1aHash(llvm::StringRef str) {
+  // Starting value of the hash (FNV offset basis)
+  static constexpr uint32_t OffsetBasis = 2166136261u;
+  // Multiplier applied after each byte is mixed in (FNV prime)
+  static constexpr uint32_t Prime = 16777619u;
+
+  uint32_t result = OffsetBasis;
+  for (size_t pos = 0, end = str.size(); pos != end; ++pos) {
+    // XOR the byte in first and multiply afterwards; each char is narrowed to an unsigned byte before mixing.
+    result = (result ^ uint8_t(str[pos])) * Prime;
+  }
+  return result;
+}
+
+// =====================================================================================================================
+// Spec object constructor given reference to caller's struct containing Items. The supplied struct must remain valid
+// for the lifetime of the MsgPackScanner::Spec, which must remain valid for the lifetime of any
+// MsgPackScanner using it. The struct is read as a flat array of Item, ending at the EndContainer of the top item.
+MsgPackScanner::Spec::Spec(const void *itemStruct) {
+  // Scan to find the size of the array by looking at nested maps and arrays.
+  const Item *items = static_cast<const Item *>(itemStruct);
+  unsigned level = 0; // Current container nesting depth
+  for (unsigned idx = 0;; ++idx) {
+    const Item &item = items[idx];
+    assert(item.itemType >= ItemType::First && item.itemType <= ItemType::Last);
+    if (item.itemType == ItemType::EndContainer) {
+      if (--level == 0) {
+        m_itemArray = ArrayRef<Item>(items, idx + 1); // Include the closing EndContainer of the top item
+        break;
+      }
+      continue;
+    }
+    if (item.itemType == ItemType::Array || item.itemType == ItemType::Map) {
+      ++level;
+      continue;
+    }
+    if (level == 0) {
+      // Spec has only one item if it is not map or array.
+      assert(idx == 0);
+      m_itemArray = ArrayRef<Item>(items, 1);
+      break;
+    }
+  }
+
+  m_parentIndices.resize(m_itemArray.size());
+  m_parentIndices[0] = UINT_MAX; // The top item has no parent
+  // Add items to map so they can be found when scanning msgpack.
+  struct StackLevel {
+    unsigned itemIndex;  // Index of the container item whose children are being processed
+    bool isMap;          // Whether that container is a map (as opposed to an array)
+    unsigned childIndex; // Key given to the next anonymous child of that container
+  };
+  SmallVector<StackLevel> stack;
+  stack.push_back({0, m_itemArray[0].itemType == ItemType::Map, 0}); // Level whose children belong to the top item
+  for (unsigned itemIndex = 1; itemIndex != m_itemArray.size(); ++itemIndex) {
+    const Item &item = m_itemArray[itemIndex];
+    m_parentIndices[itemIndex] = stack.back().itemIndex;
+    if (item.itemType != ItemType::EndContainer) {
+      if (item.name) {
+        // Item has a map key.
+        bool inserted = m_itemMap.insert({{fnv1aHash(item.name), stack.back().itemIndex}, itemIndex}).second;
+        assert(inserted && "Duplicate name at this level in MsgPackScanner spec");
+        (void)inserted;
+        LLVM_DEBUG(dbgs() << "Item " << itemIndex << " is name " << item.name << " parent index "
+                          << stack.back().itemIndex << "\n");
+      } else {
+        // No map key; make up our own array index.
+        m_itemMap.insert({{stack.back().childIndex, stack.back().itemIndex}, itemIndex});
+        LLVM_DEBUG(dbgs() << "Item " << itemIndex << " is name " << stack.back().childIndex << " parent index "
+                          << stack.back().itemIndex << "\n");
+      }
+    }
+    // Only increment childIndex for an array. Anonymous map entry items always get index 0, meaning that we
+    // can map multiple map entries against it. (scan() looks anonymous map entries up under key 0 only.)
+    if (!stack.back().isMap)
+      ++stack.back().childIndex;
+    if (item.itemType == ItemType::Map || item.itemType == ItemType::Array)
+      stack.push_back({itemIndex, item.itemType == ItemType::Map, 0}); // Record isMap so map children keep key 0
+    else if (item.itemType == ItemType::EndContainer) {
+      stack.pop_back();
+      assert((!stack.empty() || itemIndex == m_itemArray.size() - 1) && "Bad MsgPackScanner spec");
+    }
+  }
+}
+
+// =====================================================================================================================
+// Find the index of the spec item registered under {key, parent item index}; empty result if there is none.
+// The key is one of:
+// - the FNV-1a hash of the name, for a named map entry; or
+// - 0, for an anonymous map entry; or
+// - the element index, for an array entry.
+std::optional<unsigned> MsgPackScanner::Spec::lookup(unsigned key, unsigned parentItemIndex) const {
+  const auto found = m_itemMap.find({key, parentItemIndex});
+  if (found != m_itemMap.end())
+    return found->second;
+  return std::nullopt;
+}
+
+// =====================================================================================================================
+// Constructor given Spec object. Keeps a reference to the spec and sizes m_itemInfos to one entry per spec item.
+MsgPackScanner::MsgPackScanner(const Spec &spec) : m_spec(spec) {
+  m_itemInfos.resize(m_spec.size());
+}
+
+// =====================================================================================================================
+// Scan a MsgPack blob. Returns error for illegal or truncated MsgPack, or the first error returned by the callback.
+// The callback is called just after finding an item in the item array, allowing the caller to accumulate
+// a value from an item that occurs multiple times (typically as a named child of a map where the map is specified
+// as an anonymous child of an outer map).
+Error MsgPackScanner::scan(StringRef blob, function_ref<Error(MsgPackScanner &, const Item &)> callback) {
+#ifndef NDEBUG
+  assert(!m_inUse);
+  m_inUse = true;
+#endif
+  m_blob = blob;
+
+  // The top of stack StackLevel represents:
+  // - an in-progress skipping array or skipping or non-skipping map, if remaining is non-zero;
+  // - otherwise, an item about to be filled in.
+  struct StackLevel {
+    unsigned itemIndex; // Spec item the container is attached to, or UINT_MAX if it matches no item
+    size_t childCount;  // Number of child objects; a map counts its keys and its values separately
+    bool isMap;
+    size_t childIndex;  // Number of child objects consumed so far
+  };
+  SmallVector<StackLevel> stack;
+  stack.push_back({UINT_MAX, 1}); // Pseudo-container holding the single top-level object
+
+  unsigned itemIndex = 0; // First object always attached to item index 0 in spec
+  unsigned objectSize = 0;
+  m_next = 0;
+  while (!stack.empty()) {
+    m_next += objectSize; // Step over the object handled in the previous iteration
+    if (m_next == m_blob.size()) // Blob exhausted while an open container still expects children
+      return make_error<StringError>("MsgPack truncated", std::make_error_code(std::errc::invalid_argument));
+    // Get size of next object.
+    Expected<unsigned> objectSizeOr = getObjectSize();
+    if (Error err = objectSizeOr.takeError())
+      return err;
+    objectSize = *objectSizeOr;
+    // Get next object.
+    Object obj;
+    {
+      Reader reader(m_blob.drop_front(m_next));
+      if (Error err = reader.read(obj).takeError())
+        return err;
+    }
+
+    LLVM_DEBUG({
+      for (unsigned i = 0; i != stack.size(); ++i)
+        dbgs() << "  ";
+      dbgs() << m_next << ": ";
+      switch (obj.Kind) {
+      case Type::Int:
+        dbgs() << "int " << obj.Int;
+        break;
+      case Type::UInt:
+        dbgs() << "uint " << obj.UInt;
+        break;
+      case Type::Nil:
+        dbgs() << "nil";
+        break;
+      case Type::Boolean:
+        dbgs() << "boolean " << obj.Bool;
+        break;
+      case Type::Float:
+        dbgs() << "float " << obj.Float;
+        break;
+      case Type::String:
+        dbgs() << "string " << obj.Raw;
+        break;
+      case Type::Binary:
+        dbgs() << "binary " << obj.Raw;
+        break;
+      case Type::Array:
+        dbgs() << "array " << obj.Length;
+        break;
+      case Type::Map:
+        dbgs() << "map " << obj.Length;
+        break;
+      case Type::Extension:
+        dbgs() << "extension " << obj.Extension.Type << " " << obj.Extension.Bytes;
+        break;
+      default:
+        dbgs() << "unknown";
+        break;
+      }
+    });
+
+    if (itemIndex != UINT_MAX) {
+      // itemIndex is already set, either because this is the first time round the loop (the first object is
+      // always attached to item index 0), or because the previous loop read a map key that matched one we
+      // are looking for, so we are now on the value for that key.
+    } else if (stack.back().itemIndex != UINT_MAX) {
+      // Set itemIndex to the item matching the current object in the parent map or array being scanned.
+      if (stack.back().isMap) {
+        // Check for this object being the key in a map. (A map has Length*2 children, where, counting from 0,
+        // the even numbered ones are the keys and the odd numbered ones are the values.)
+        if (stack.back().childIndex % 2 == 0) {
+          if (obj.Kind == Type::String) {
+            unsigned key = fnv1aHash(obj.Raw);
+            LLVM_DEBUG(dbgs() << " (checking name " << key << " parent " << stack.back().itemIndex << ")");
+            std::optional<unsigned> found = m_spec.lookup(key, stack.back().itemIndex);
+            if (found) {
+              itemIndex = found.value();
+              m_itemInfos[itemIndex].keyOffset = m_next;
+              ++stack.back().childIndex; // The key counts as one consumed child of the map
+              LLVM_DEBUG(dbgs() << ": key for item " << itemIndex << "\n");
+              continue; // Loop back for the value corresponding to this key.
+            }
+          }
+          // Check for a match for an anonymous item in a map. This is also tried for string keys not found above.
+          LLVM_DEBUG(dbgs() << " (checking name 0 parent " << stack.back().itemIndex << ")");
+          std::optional<unsigned> found = m_spec.lookup(0, stack.back().itemIndex);
+          if (found) {
+            itemIndex = found.value();
+            m_itemInfos[itemIndex].keyOffset = m_next;
+            ++stack.back().childIndex; // The key counts as one consumed child of the map
+            LLVM_DEBUG(dbgs() << ": key for item " << itemIndex << "\n");
+            continue; // Loop back for the value corresponding to this key.
+          }
+        }
+      }
+      if (!stack.back().isMap) {
+        // Check for this object being an array element.
+        unsigned key = stack.back().childIndex;
+        LLVM_DEBUG(dbgs() << " (checking name " << key << " parent " << stack.back().itemIndex << ")");
+        std::optional<unsigned> found = m_spec.lookup(key, stack.back().itemIndex);
+        if (found)
+          itemIndex = found.value();
+      }
+    }
+
+    if (itemIndex != UINT_MAX) {
+      // This object is being attached to an item in the spec.
+      m_itemInfos[itemIndex].offset = m_next;
+      m_itemInfos[itemIndex].size = objectSize;
+      LLVM_DEBUG(dbgs() << ": item " << itemIndex);
+      if (itemIndex != UINT_MAX && callback) { // NOTE(review): the itemIndex test repeats the enclosing condition
+        if (Error err = callback(*this, m_spec[itemIndex]))
+          return err;
+      }
+    }
+    LLVM_DEBUG(dbgs() << "\n");
+
+    if (obj.Kind == Type::Map && obj.Length != 0) {
+      // Start a new map. It has Length {key,value} pairs of entries, thus Length*2 entries.
+      stack.push_back({itemIndex, obj.Length * 2, /*isMap=*/true});
+    } else if (obj.Kind == Type::Array && obj.Length != 0) {
+      // Start a new array.
+      stack.push_back({itemIndex, obj.Length, /*isMap=*/false});
+    } else {
+      // Increment count on current container; pop if at end. Popping a level completes a child of the next one up.
+      while (++stack.back().childIndex == stack.back().childCount) {
+        unsigned poppingItemIndex = stack.back().itemIndex;
+        if (poppingItemIndex != UINT_MAX)
+          m_itemInfos[poppingItemIndex].endOffset = m_next + objectSize; // Offset just past the container's last object
+        stack.pop_back();
+        LLVM_DEBUG({
+          for (unsigned i = 0; i != stack.size(); ++i)
+            dbgs() << "  ";
+          dbgs() << "pop\n";
+        });
+        if (stack.empty())
+          break;
+      }
+    }
+    itemIndex = UINT_MAX; // The next object must be matched afresh against the spec
+  }
+  LLVM_DEBUG(dbgs() << "Finished msgpack scan\n");
+  return Error::success();
+}
+
+// =====================================================================================================================
+// Get the encoded size in bytes of the object starting at m_next.
+// We use MsgPackReader to read the next object, but it does not tell us how big the object is, so we have
+// to figure that out for ourselves. For an array or map, the object size does not include the enclosed
+// elements. Returns an error if the blob ends before a length field that needs to be read.
+// If we upstream this code into LLVM, then we could instead add a public method to MsgPackReader to get
+// its next pointer.
+Expected<unsigned> MsgPackScanner::getObjectSize() const {
+  unsigned firstByte = uint8_t(m_blob[m_next]);
+  switch (firstByte) {
+
+  case FirstByte::Int8: // Fixed-size scalars: first byte followed by the value
+  case FirstByte::UInt8:
+    return 1 + sizeof(int8_t);
+
+  case FirstByte::Int16:
+  case FirstByte::UInt16:
+    return 1 + sizeof(int16_t);
+
+  case FirstByte::Int32:
+  case FirstByte::UInt32:
+    return 1 + sizeof(int32_t);
+
+  case FirstByte::Int64:
+  case FirstByte::UInt64:
+    return 1 + sizeof(int64_t);
+
+  case FirstByte::Float32:
+    return 1 + sizeof(float);
+
+  case FirstByte::Float64:
+    return 1 + sizeof(double);
+
+  case FirstByte::Str8: // Str/Bin: first byte, length field, then that many bytes of payload
+  case FirstByte::Bin8:
+    if (m_next + 1 + sizeof(uint8_t) > m_blob.size())
+      return make_error<StringError>("MsgPack truncated", std::make_error_code(std::errc::invalid_argument));
+    return 1 + sizeof(uint8_t) + endian::read<uint8_t, Endianness>(m_blob.data() + m_next + 1);
+
+  case FirstByte::Str16:
+  case FirstByte::Bin16:
+    if (m_next + 1 + sizeof(uint16_t) > m_blob.size())
+      return make_error<StringError>("MsgPack truncated", std::make_error_code(std::errc::invalid_argument));
+    return 1 + sizeof(uint16_t) + endian::read<uint16_t, Endianness>(m_blob.data() + m_next + 1);
+
+  case FirstByte::Str32:
+  case FirstByte::Bin32:
+    if (m_next + 1 + sizeof(uint32_t) > m_blob.size())
+      return make_error<StringError>("MsgPack truncated", std::make_error_code(std::errc::invalid_argument));
+    return 1 + sizeof(uint32_t) + endian::read<uint32_t, Endianness>(m_blob.data() + m_next + 1);
+
+  case FirstByte::Array16: // Array/Map: only the header (first byte and element count) is counted
+  case FirstByte::Map16:
+    return 1 + sizeof(uint16_t);
+
+  case FirstByte::Array32:
+  case FirstByte::Map32:
+    return 1 + sizeof(uint32_t);
+
+  case FirstByte::FixExt1: // FixExt: first byte, one type byte, then a fixed-size payload
+    return 1 + 1 + 1;
+
+  case FirstByte::FixExt2:
+    return 1 + 1 + 2;
+
+  case FirstByte::FixExt4:
+    return 1 + 1 + 4;
+
+  case FirstByte::FixExt8:
+    return 1 + 1 + 8;
+
+  case FirstByte::FixExt16:
+    return 1 + 1 + 16;
+
+  case FirstByte::Ext8: // Ext: first byte, length field (read at m_next + 1), one type byte, then payload
+    if (m_next + 1 + sizeof(uint8_t) > m_blob.size())
+      return make_error<StringError>("MsgPack truncated", std::make_error_code(std::errc::invalid_argument));
+    return 1 + 1 + sizeof(uint8_t) + endian::read<uint8_t, Endianness>(m_blob.data() + m_next + 1);
+
+  case FirstByte::Ext16:
+    if (m_next + 1 + sizeof(uint16_t) > m_blob.size())
+      return make_error<StringError>("MsgPack truncated", std::make_error_code(std::errc::invalid_argument));
+    return 1 + 1 + sizeof(uint16_t) + endian::read<uint16_t, Endianness>(m_blob.data() + m_next + 1);
+
+  case FirstByte::Ext32:
+    if (m_next + 1 + sizeof(uint32_t) > m_blob.size())
+      return make_error<StringError>("MsgPack truncated", std::make_error_code(std::errc::invalid_argument));
+    return 1 + 1 + sizeof(uint32_t) + endian::read<uint32_t, Endianness>(m_blob.data() + m_next + 1);
+
+  default:
+    if ((firstByte & FixBitsMask::String) == FixBits::String) // FixStr: low 5 bits are the payload length
+      return 1 + (firstByte & 0x1f);                          // Parenthesized: '+' binds tighter than '&'
+    return 1; // Everything else is a single byte
+  }
+}
+
+// =====================================================================================================================
+// Report whether an item currently has a value, i.e. whether getValue() finds a non-empty encoding for it.
+bool MsgPackScanner::isSet(const Item &item) const {
+  return getValue(item).size() != 0;
+}
+
+// =====================================================================================================================
+// Read an item as a bool. The result is empty if the item was not found or its value is not a MsgPack boolean.
+std::optional<bool> MsgPackScanner::asBool(const Item &item) const {
+  StringRef encoded = getValue(item);
+  if (encoded.empty())
+    return std::nullopt; // Item not found
+
+  Object decoded;
+  Reader reader(encoded);
+  void(bool(reader.read(decoded))); // Check success, and assert on failure
+  if (decoded.Kind != Type::Boolean)
+    return std::nullopt;
+  return decoded.Bool;
+}
+
+// =====================================================================================================================
+// Read an item as an integer (a signed value is converted to uint64_t). Empty if not found or not an integer.
+std::optional<uint64_t> MsgPackScanner::asInt(const Item &item) const {
+  StringRef encoded = getValue(item);
+  if (encoded.empty())
+    return std::nullopt; // Item not found
+
+  Object decoded;
+  Reader reader(encoded);
+  void(bool(reader.read(decoded))); // Check success, and assert on failure
+  if (decoded.Kind == Type::Int)
+    return decoded.Int;
+  if (decoded.Kind == Type::UInt)
+    return decoded.UInt;
+  return std::nullopt;
+}
+
+// =====================================================================================================================
+// Read an item as a StringRef (string or binary payload). Empty if not found or of any other type.
+std::optional<StringRef> MsgPackScanner::asString(const Item &item) const {
+  StringRef encoded = getValue(item);
+  if (encoded.empty())
+    return std::nullopt; // Item not found
+
+  Object decoded;
+  Reader reader(encoded);
+  void(bool(reader.read(decoded))); // Check success, and assert on failure
+  if (decoded.Kind != Type::String && decoded.Kind != Type::Binary)
+    return std::nullopt;
+  return decoded.Raw;
+}
+
+// =====================================================================================================================
+// Locate the MsgPack encoding of an item's current value:
+// - a value attached by one of the set methods is found in m_newData;
+// - failing that, a value found when scanning is located in the original MsgPack blob;
+// - if neither applies, the result is "".
+// The StringRef returned runs on to the end of m_newData or of the blob rather than stopping at the end of the
+// value; the caller is expected to read just the one well-formed MsgPack value at its start.
+StringRef MsgPackScanner::getValue(const Item &item) const {
+  unsigned index = &item - &m_spec[0];
+  assert(index < m_spec.size());
+  const ItemInfo &info = m_itemInfos[index];
+  if (info.newOffset != ItemInfo::NoNewOffset)
+    return StringRef(m_newData).drop_front(info.newOffset); // A new value has been attached
+  if (info.size == 0)
+    return ""; // Neither set nor present in the scanned blob
+  return m_blob.drop_front(info.offset);
+}
+
+// =====================================================================================================================
+// Set an item to a bool value. Named differently from set() so that other types cannot implicitly convert to bool.
+void MsgPackScanner::setBool(const Item &item, bool value) {
+  // Append the MsgPack encoding of the value to m_newData.
+  const size_t valueStart = m_newData.size();
+  raw_svector_ostream newDataStream(m_newData);
+  msgpack::Writer msgPackWriter(newDataStream);
+  msgPackWriter.write(value);
+  // Make the item refer to the bytes just appended.
+  setValue(item, valueStart, m_newData.size() - valueStart);
+}
+
+// =====================================================================================================================
+// Set an item to an unsigned integer value.
+void MsgPackScanner::set(const Item &item, uint64_t value) {
+  // Append the MsgPack encoding of the value to m_newData.
+  const size_t valueStart = m_newData.size();
+  raw_svector_ostream newDataStream(m_newData);
+  msgpack::Writer msgPackWriter(newDataStream);
+  msgPackWriter.write(value);
+  // Make the item refer to the bytes just appended.
+  setValue(item, valueStart, m_newData.size() - valueStart);
+}
+
+// =====================================================================================================================
+// Set an item to a string value.
+void MsgPackScanner::set(const Item &item, StringRef value) {
+  // Append the MsgPack encoding of the value to m_newData.
+  const size_t valueStart = m_newData.size();
+  raw_svector_ostream newDataStream(m_newData);
+  msgpack::Writer msgPackWriter(newDataStream);
+  msgPackWriter.write(value);
+  // Make the item refer to the bytes just appended.
+  setValue(item, valueStart, m_newData.size() - valueStart);
+}
+
+// =====================================================================================================================
+// Set an item to the new value that has just been written in MsgPack format to m_newData.
+// If the item does not yet exist, its parent map/array header is also rewritten with an incremented length.
+// @param item : Item to attach the new value to
+// @param newOffset : Offset of new value in m_newData
+// @param newSize : Size of new value in m_newData
+//
+// @return : The old offset to use when inserting children of the item.
+//
+size_t MsgPackScanner::setValue(const Item &item, size_t newOffset, size_t newSize) {
+#ifndef NDEBUG
+  m_inUse = true;
+#endif
+  unsigned itemIndex = &item - &m_spec[0];
+  assert(itemIndex < m_spec.size());
+  ItemInfo &itemInfo = m_itemInfos[itemIndex];
+  size_t insertOffset = itemInfo.endOffset;
+  if (insertOffset == 0) // No container end recorded; fall back to the item's own offset
+    insertOffset = itemInfo.offset;
+  if (itemInfo.newOffset == ItemInfo::NoNewOffset) {
+    if (itemInfo.size == 0) {
+      // Item does not yet exist and needs to be created.
+      // Check the parent.
+      unsigned parentIndex = m_spec.getParentIndex(itemIndex);
+      if (parentIndex != UINT_MAX) {
+        const Item &parentItem = m_spec[parentIndex];
+        size_t parentNewOffset = m_newData.size();
+        raw_svector_ostream stream(m_newData);
+        msgpack::Writer writer(stream);
+        // Determine the new length of the parent map/array: 1 if it did not already exist, otherwise one more
+        // than its previous length.
+        unsigned length = 1;
+        StringRef parentValue = getValue(parentItem);
+        if (!parentValue.empty()) {
+          Object obj;
+          Reader reader(parentValue);
+          void(bool(reader.read(obj))); // Check success, and assert on failure
+          length = obj.Length + 1;
+        }
+        // Write the new map/array header.
+        if (parentItem.itemType == ItemType::Map)
+          writer.writeMapSize(length);
+        else
+          writer.writeArraySize(length);
+        // Recursive call: attaches the new header to the parent, creating the parent's own parent if needed.
+        insertOffset = setValue(parentItem, parentNewOffset, m_newData.size() - parentNewOffset);
+        // If the parent is a map, we need to write the key.
+        if (parentItem.itemType == ItemType::Map) {
+          itemInfo.newKeyOffset = m_newData.size();
+          writer.write(StringRef(item.name));
+          itemInfo.newKeySize = m_newData.size() - itemInfo.newKeyOffset;
+        }
+        itemInfo.offset = insertOffset; // The new item is inserted at the parent's insertion point
+      }
+    } else {
+      // First time setting an existing item. Check if the value being set is the same as the old value.
+      if (m_blob.drop_front(itemInfo.offset).substr(0, newSize) ==
+          StringRef(m_newData).drop_front(newOffset).substr(0, newSize))
+        return insertOffset; // No change in value; nothing to do.
+    }
+  }
+  // Attach the new value to the item.
+  // m_gen goes up by 2 to allow a possible new key to use gen - 1 in write(), to ensure that the new key
+  // gets written before the new value.
+  itemInfo.newOffset = newOffset;
+  itemInfo.newSize = newSize;
+  m_gen += 2;
+  itemInfo.gen = m_gen;
+  // For the case when this is a map or array being created or having its size updated ready to insert a child,
+  // return the old offset to insert at.
+  return insertOffset;
+}
+
+// =====================================================================================================================
+// Write updated MsgPack to the stream: the original blob with every recorded change spliced in at its old offset.
+void MsgPackScanner::write(raw_ostream &stream) {
+  // Gather change records for points where data is removed, inserted or changed.
+  struct Change {
+    size_t oldOffset; // Offset in m_blob where the change applies
+    size_t oldSize; // Number of m_blob bytes removed or replaced (0 for a pure insertion)
+    size_t newOffset; // Offset in m_newData of the bytes to write
+    size_t newSize; // Number of bytes to write from m_newData
+    unsigned gen; // Orders changes that share the same oldOffset
+  };
+  SmallVector<Change> changes;
+  for (unsigned itemIndex = 0; itemIndex != m_spec.size(); ++itemIndex) {
+    ItemInfo &itemInfo = m_itemInfos[itemIndex];
+    Change change{};
+    if (itemInfo.newSize == ItemInfo::NoReplacementNewSize) {
+      // Deleting old item without replacing it.
+      // TODO: There is no code yet to delete an item that would exercise this path. The idea is that the method to
+      // delete an item would also take care of modifying the parent map/array header to change its child count.
+      change.oldOffset = itemInfo.offset;
+      change.oldSize = itemInfo.size;
+      if (itemInfo.endOffset != 0) {
+        // This item is a map or array; delete all the contents too.
+        change.oldSize = itemInfo.endOffset - itemInfo.offset;
+      }
+      if (itemInfo.keyOffset != 0) {
+        // This item is in a map; also delete the key.
+        change.oldOffset = itemInfo.keyOffset;
+        change.oldSize += itemInfo.offset - itemInfo.keyOffset;
+      }
+    } else if (itemInfo.newOffset != ItemInfo::NoNewOffset) {
+      // Replacing or adding this item.
+      if (itemInfo.newKeyOffset != ItemInfo::NoNewOffset) {
+        // Also adding new key. Do that first.
+        change.oldOffset = itemInfo.offset;
+        change.oldSize = 0;
+        change.newOffset = itemInfo.newKeyOffset;
+        change.newSize = itemInfo.newKeySize;
+        change.gen = itemInfo.gen - 1; // Key needs to go before value
+        changes.push_back(change);
+        LLVM_DEBUG({
+          dbgs() << "Change (new key) gen=" << change.gen << " oldOffset=" << change.oldOffset
+                 << " oldSize=" << change.oldSize << " new=";
+          for (char ch : StringRef(m_newData).drop_front(change.newOffset).take_front(change.newSize))
+            dbgs() << format("%2.2x ", (unsigned char)ch);
+          dbgs() << "\n";
+        });
+      }
+      change = {};
+      change.oldOffset = itemInfo.offset;
+      change.oldSize = itemInfo.size;
+      change.newOffset = itemInfo.newOffset;
+      change.newSize = itemInfo.newSize;
+    } else {
+      continue; // Item is unchanged: nothing to record
+    }
+    change.gen = itemInfo.gen;
+    changes.push_back(change);
+    LLVM_DEBUG({
+      dbgs() << "Change gen=" << change.gen << " oldOffset=" << change.oldOffset << " oldSize=" << change.oldSize
+             << " new=";
+      for (char ch : StringRef(m_newData).drop_front(change.newOffset).take_front(change.newSize))
+        dbgs() << format("%2.2x ", (unsigned char)ch);
+      dbgs() << "\n";
+    });
+  }
+
+  // Sort the change records by oldOffset then gen.
+  std::sort(changes.begin(), changes.end(), [](const Change &lhs, const Change &rhs) {
+    return std::tie(lhs.oldOffset, lhs.gen) < std::tie(rhs.oldOffset, rhs.gen);
+  });
+
+  // Write the new MsgPack blob. oldOffset tracks how much of m_blob has been consumed (copied or skipped) so far.
+  size_t oldOffset = 0;
+  for (const Change &change : changes) {
+    // Write old data up to the point of the change record.
+    stream << m_blob.take_front(change.oldOffset).drop_front(oldOffset);
+    // Skip old data being removed or replaced.
+    oldOffset = change.oldOffset + change.oldSize;
+    // Write new data.
+    stream << StringRef(m_newData).drop_front(change.newOffset).take_front(change.newSize);
+  }
+  // Write remaining old data.
+  stream << m_blob.drop_front(oldOffset);
+}
diff --git a/lgc/util/RegStackUsage.cpp b/lgc/util/RegStackUsage.cpp
new file mode 100644
index 0000000000..167e1c9c9a
--- /dev/null
+++ b/lgc/util/RegStackUsage.cpp
@@ -0,0 +1,563 @@
+/*
+ ***********************************************************************************************************************
+ *
+ *  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal in the Software without restriction, including without limitation the
+ *  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ *  sell copies of the Software, and to permit persons to whom the Software is
+ *  furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice shall be included in all
+ *  copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ *
+ **********************************************************************************************************************/
+
+// Extraction, merging and inserting reg/stack usage in PAL metadata between different ELFs.
+// A front-end can use this to propagate register and stack usage from library ELFs up to a compute
+// shader ELF.
+
+#include "lgc/RegStackUsage.h"
+#include "lgc/state/AbiMetadata.h"
+#include "lgc/util/MsgPackScanner.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Object/ELFObjectFile.h"
+
+#define DEBUG_TYPE "lgc-reg-stack-usage"
+
+using namespace lgc;
+using namespace llvm;
+
+namespace {
+
+// Item spec array of PAL metadata items that we are interested in, for MsgPackScanner to use. Indentation mirrors
+// the container nesting. We call MsgPackScanner methods such as asInt() passing a reference to one of these items.
+// clang-format off
+static const struct {
+  MsgPackScanner::Item top = {MsgPackScanner::ItemType::Map};
+  MsgPackScanner::Item   pipelines = {MsgPackScanner::ItemType::Array, "amdpal.pipelines"};
+  MsgPackScanner::Item     pipeline0 = {MsgPackScanner::ItemType::Map};
+  MsgPackScanner::Item       hardwareStages = {MsgPackScanner::ItemType::Map, ".hardware_stages"};
+  MsgPackScanner::Item         cs = {MsgPackScanner::ItemType::Map, ".cs"};
+  MsgPackScanner::Item           csBackendStackSize = {MsgPackScanner::ItemType::Scalar, ".backend_stack_size"};
+  MsgPackScanner::Item           csFrontendStackSize = {MsgPackScanner::ItemType::Scalar, ".frontend_stack_size"};
+  MsgPackScanner::Item           csCpsGlobal = {MsgPackScanner::ItemType::Scalar, ".cps_global"};
+  MsgPackScanner::Item           csScratchEn = {MsgPackScanner::ItemType::Scalar, ".scratch_en"};
+  MsgPackScanner::Item           csScratchMemorySize = {MsgPackScanner::ItemType::Scalar, ".scratch_memory_size"};
+  MsgPackScanner::Item           csLdsSize = {MsgPackScanner::ItemType::Scalar, ".lds_size"};
+  MsgPackScanner::Item           csSgprCount = {MsgPackScanner::ItemType::Scalar, ".sgpr_count"};
+  MsgPackScanner::Item           csVgprCount = {MsgPackScanner::ItemType::Scalar, ".vgpr_count"};
+  MsgPackScanner::Item           csMemOrdered = {MsgPackScanner::ItemType::Scalar, ".mem_ordered"};
+  MsgPackScanner::Item         endCs = {MsgPackScanner::ItemType::EndContainer};
+  MsgPackScanner::Item       endHardwareStages = {MsgPackScanner::ItemType::EndContainer};
+  MsgPackScanner::Item       shaderFunctions = {MsgPackScanner::ItemType::Map, ".shader_functions"};
+  MsgPackScanner::Item         theFunc = {MsgPackScanner::ItemType::Map}; // No name, so matches all .shader_functions entries
+  MsgPackScanner::Item           funcBackendStackSize = {MsgPackScanner::ItemType::Scalar, ".backend_stack_size"};
+  MsgPackScanner::Item           funcFrontendStackSize = {MsgPackScanner::ItemType::Scalar, ".frontend_stack_size"};
+  MsgPackScanner::Item           funcStackFrameSizeInBytes = {MsgPackScanner::ItemType::Scalar, ".stack_frame_size_in_bytes"};
+  MsgPackScanner::Item           funcLdsSize = {MsgPackScanner::ItemType::Scalar, ".lds_size"};
+  MsgPackScanner::Item           funcSgprCount = {MsgPackScanner::ItemType::Scalar, ".sgpr_count"};
+  MsgPackScanner::Item           funcVgprCount = {MsgPackScanner::ItemType::Scalar, ".vgpr_count"};
+  MsgPackScanner::Item         endTheFunc = {MsgPackScanner::ItemType::EndContainer};
+  MsgPackScanner::Item       endShaderFunctions = {MsgPackScanner::ItemType::EndContainer};
+  MsgPackScanner::Item       shaders = {MsgPackScanner::ItemType::Map, ".shaders"};
+  MsgPackScanner::Item         compute = {MsgPackScanner::ItemType::Map, ".compute"};
+  MsgPackScanner::Item           shaderSubtype = {MsgPackScanner::ItemType::Scalar, ".shader_subtype"};
+  MsgPackScanner::Item         endCompute = {MsgPackScanner::ItemType::EndContainer};
+  MsgPackScanner::Item       endShaders = {MsgPackScanner::ItemType::EndContainer};
+  MsgPackScanner::Item     endPipeline0 = {MsgPackScanner::ItemType::EndContainer};
+  MsgPackScanner::Item   endPipelines = {MsgPackScanner::ItemType::EndContainer};
+  MsgPackScanner::Item endTop = {MsgPackScanner::ItemType::EndContainer};
+} items;
+// clang-format on
+// Spec built from the item array above; every RegStackUsageImpl constructor passes it to its MsgPackScanner.
+static const MsgPackScanner::Spec msgPackScannerSpec(&items);
+
+// Optional-like holder: a value of type T plus a flag recording whether one has been assigned.
+template <typename T> class OptionalPod {
+public:
+  // Whether a value has been assigned.
+  bool hasValue() const { return m_haveValue; }
+  explicit operator bool() const { return m_haveValue; }
+
+  // Access the held value; asserts that one has been assigned.
+  T &value() {
+    assert(m_haveValue);
+    return m_value;
+  }
+  const T &value() const {
+    assert(m_haveValue);
+    return m_value;
+  }
+  // The held value if one has been assigned, otherwise the supplied fallback.
+  T value_or(T other) const { return m_haveValue ? m_value : other; }
+  // Assigning a T stores it and marks the holder as having a value.
+  void operator=(const T &other) {
+    m_value = other;
+    m_haveValue = true;
+  }
+  // Print the held value, or "no value" if nothing has been assigned.
+  friend raw_ostream &operator<<(raw_ostream &stream, const OptionalPod<T> t) {
+    if (!t.hasValue())
+      return stream << "no value";
+    return stream << t.value();
+  }
+
+private:
+  T m_value = {};
+  bool m_haveValue = false;
+};
+
+// Struct for reg/stack usage. It is copied bytewise to and from IR metadata, so it must stay memcpy-able.
+struct Usage {
+  unsigned maxRecursionDepth; // From the ELF constructor argument; merge() keeps the maximum
+  unsigned callableShaderCount; // Incremented when scanning finds a .shader_subtype of "Callable"
+  unsigned backendStackSize; // Maximum .backend_stack_size seen
+  OptionalPod<unsigned> frontendStackSize; // Maximum .frontend_stack_size seen; no value if none was seen
+  unsigned stackFrameSizeInBytes; // Sum of .stack_frame_size_in_bytes over shader functions
+  unsigned scratchMemorySize; // .cs .scratch_memory_size
+  unsigned ldsSize; // Maximum .lds_size seen
+  unsigned sgprCount; // Maximum .sgpr_count seen
+  unsigned vgprCount; // Maximum .vgpr_count seen
+  bool cpsGlobal; // NOTE(review): not set by scanPalMetadata(); presumably set elsewhere - confirm
+  bool scratchEn; // .cs .scratch_en
+  bool memOrdered; // .cs .mem_ordered
+};
+
+// =====================================================================================================================
+// Print each field of a Usage on its own indented line, for debug output.
+[[maybe_unused]] static raw_ostream &operator<<(raw_ostream &stream, const Usage &usage) {
+  stream << "  maxRecursionDepth " << usage.maxRecursionDepth << "\n";
+  stream << "  callableShaderCount " << usage.callableShaderCount << "\n";
+  stream << "  backendStackSize " << usage.backendStackSize << "\n";
+  stream << "  frontendStackSize " << usage.frontendStackSize << "\n";
+  stream << "  stackFrameSizeInBytes " << usage.stackFrameSizeInBytes << "\n";
+  stream << "  scratchMemorySize " << usage.scratchMemorySize << "\n";
+  stream << "  ldsSize " << usage.ldsSize << "\n";
+  stream << "  sgprCount " << usage.sgprCount << "\n";
+  stream << "  vgprCount " << usage.vgprCount << "\n";
+  stream << "  cpsGlobal " << usage.cpsGlobal << "\n";
+  stream << "  scratchEn " << usage.scratchEn << "\n";
+  stream << "  memOrdered " << usage.memOrdered << "\n";
+  return stream;
+}
+
+} // anonymous namespace
+
+namespace lgc {
+
+// Class to parse reg/stack usage from PAL metadata and merge it back.
+class RegStackUsageImpl {
+public:
+  // Default constructor: zero-initialized usage, with the MsgPackScanner set up for the item spec.
+  RegStackUsageImpl() : m_msgPackScanner(msgPackScannerSpec) {}
+
+  // RegStackUsage methods that get forwarded to this class.
+  ~RegStackUsageImpl() = default;
+  RegStackUsageImpl(StringRef elfBlob, unsigned maxRecursionDepth, uint64_t rayGenUsage);
+  RegStackUsageImpl(const Module &module);
+  void writeMetadata(Module &module) const;
+  void merge(const RegStackUsageImpl &shaderUsage);
+  void finalizeAndUpdate(SmallVectorImpl<char> &elfBuffer, size_t startOffset, unsigned frontendGlobalAlignment);
+
+private:
+  // Construct from PAL metadata blob. This is only used internally for the "Re-scan the new blob to check it" code.
+  RegStackUsageImpl(StringRef palMetadata);
+
+  // Set up m_usage values by scanning PAL metadata blob.
+  void scanPalMetadata();
+
+  // Finalize usage before writing back in to the launch kernel.
+  void finalize(unsigned frontendGlobalAlignment);
+
+  // Update the ELF with supplied usage info, and rewrite the ELF. This could make the ELF a different size.
+  void updateAndWrite(const Usage &usage, SmallVectorImpl<char> &elfBuffer, size_t startOffset);
+
+  // Replace some section data in an ELF.
+  void replaceElfData(object::ObjectFile &elf, SmallVectorImpl<char> &elfBuffer, size_t startOffset, size_t dataOffset,
+                      size_t oldDataSize, StringRef newData);
+
+  MsgPackScanner m_msgPackScanner; // Scanner used on m_palMetadata, driven by msgPackScannerSpec
+  Usage m_usage = {}; // The reg/stack usage values held by this object
+  StringRef m_elfBlob; // ELF blob given to the constructor; not owned
+  std::unique_ptr<object::ObjectFile> m_elf; // Object file created from m_elfBlob
+  unsigned m_noteAlign = 0; // sh_addralign of the note section holding the PAL metadata note
+  size_t m_palMetadataNoteOffset = 0; // Offset within m_elfBlob of the PAL metadata note header
+  StringRef m_palMetadata; // The PAL metadata MsgPack blob
+#ifndef NDEBUG
+  bool m_finalized = false; // Checked by merge(): merging after finalizing is not allowed
+#endif
+};
+
+} // namespace lgc
+
+// Name of the IR named-metadata node holding reg/stack usage. All code that reads and writes it is in this file.
+static const char RegStackUsageMetadataName[] = "lgc.reg.stack.usage";
+
+// =====================================================================================================================
+// Forwarding methods from RegStackUsage to RegStackUsageImpl: each one just delegates to the object held in m_impl.
+RegStackUsage::~RegStackUsage() = default;
+
+RegStackUsage::RegStackUsage() : m_impl(std::make_unique<RegStackUsageImpl>()) {
+}
+
+RegStackUsage::RegStackUsage(StringRef elfBlob, unsigned maxRecursionDepth, uint64_t rayGenUsage)
+    : m_impl(std::make_unique<RegStackUsageImpl>(elfBlob, maxRecursionDepth, rayGenUsage)) {
+}
+
+RegStackUsage::RegStackUsage(const Module &module) : m_impl(std::make_unique<RegStackUsageImpl>(module)) {
+}
+
+void RegStackUsage::writeMetadata(Module &module) const {
+  m_impl->writeMetadata(module);
+}
+
+void RegStackUsage::merge(const RegStackUsage &shaderUsage) {
+  m_impl->merge(*shaderUsage.m_impl);
+}
+
+void RegStackUsage::finalizeAndUpdate(SmallVectorImpl<char> &elfBuffer, size_t startOffset,
+                                      unsigned frontendGlobalAlignment) {
+  m_impl->finalizeAndUpdate(elfBuffer, startOffset, frontendGlobalAlignment);
+}
+
+// =====================================================================================================================
+// Construct from ELF blob. This reads the reg/stack usage from the ELF's PAL metadata.
+// This is passed rayGenUsage to allow for a future enhancement where frontend stack size is calculated in a
+// more sophisticated way that takes into account which shaders are reachable from which rayGens.
+//
+// @param elfBlob : The ELF blob; must remain valid for the lifetime of the RegStackUsage object
+// @param maxRecursionDepth : Max recursion depth for this shader as specified by the app; 0 for traversal
+// @param rayGenUsage : bitmap of which rayGens can reach this shader, with bit 63 covering all rayGens
+//                      beyond the first 63; 0 for traversal. Not used by this constructor at present.
+//
+RegStackUsageImpl::RegStackUsageImpl(StringRef elfBlob, unsigned maxRecursionDepth, uint64_t rayGenUsage)
+    : m_msgPackScanner(msgPackScannerSpec), m_elfBlob(elfBlob) {
+  m_usage.maxRecursionDepth = maxRecursionDepth;
+
+  m_elf = cantFail(object::ObjectFile::createELFObjectFile(MemoryBufferRef(elfBlob, "")));
+  for (const object::SectionRef &section : m_elf->sections()) {
+    object::ELFSectionRef elfSection(section);
+    if (elfSection.getType() == ELF::SHT_NOTE) {
+      // This is a .note section. Find the PAL metadata note. (The cast below assumes a 64-bit little-endian ELF.)
+      Error err = ErrorSuccess();
+      auto &elfFile = cast<object::ELFObjectFile<object::ELF64LE>>(&*m_elf)->getELFFile();
+      auto shdr = cantFail(elfFile.getSection(elfSection.getIndex()));
+      for (auto note : elfFile.notes(*shdr, err)) {
+        if (note.getName() == Util::Abi::AmdGpuArchName && note.getType() == ELF::NT_AMDGPU_METADATA) {
+          // Found the PAL metadata note record. Remember its position (in a sneaky way to get around Elf_Note_Impl
+          // hiding some details).
+          m_palMetadataNoteOffset = note.getName().data() - m_elfBlob.data() -
+                                    sizeof(object::Elf_Nhdr_Impl<object::ELFType<llvm::endianness::little, true>>);
+          m_noteAlign = shdr->sh_addralign;
+          ArrayRef<uint8_t> desc = note.getDesc(m_noteAlign);
+          // Scan the PAL metadata.
+          m_palMetadata = StringRef(reinterpret_cast<const char *>(desc.data()), desc.size());
+          scanPalMetadata();
+          break; // Stop at the first PAL metadata note
+        }
+      }
+      if (err)
+        report_fatal_error("Bad PAL metadata format");
+      break; // Only the first SHT_NOTE section is examined
+    }
+  }
+}
+
+// =====================================================================================================================
+// Construct from PAL metadata blob. This is only used internally for the "Re-scan the new blob to check it" code.
+RegStackUsageImpl::RegStackUsageImpl(StringRef palMetadata)
+    : m_msgPackScanner(msgPackScannerSpec), m_palMetadata(palMetadata) {
+  scanPalMetadata(); // Sets up m_usage from m_palMetadata
+}
+
+// =====================================================================================================================
+// Scan the PAL metadata blob in m_palMetadata and fold the values found into m_usage.
+void RegStackUsageImpl::scanPalMetadata() {
+  // Invoked by MsgPackScanner for each item of the spec that it finds in the blob.
+  auto onItemFound = [this](MsgPackScanner &scanner, const MsgPackScanner::Item &item) {
+    auto is = [&item](const MsgPackScanner::Item &candidate) { return &item == &candidate; };
+    auto intValue = [&scanner, &item] { return unsigned(scanner.asInt(item).value_or(0)); };
+    auto boolValue = [&scanner, &item] { return scanner.asBool(item).value_or(false); };
+    // Backend stack usage (scratch used within a func in continuations) and frontend stack usage (CPS stack):
+    // keep the largest value seen.
+    if (is(items.csBackendStackSize) || is(items.funcBackendStackSize))
+      m_usage.backendStackSize = std::max(m_usage.backendStackSize, intValue());
+    else if (is(items.csFrontendStackSize) || is(items.funcFrontendStackSize))
+      m_usage.frontendStackSize = std::max(m_usage.frontendStackSize.value_or(0), intValue());
+    // Other stack usage: add up the values of all functions.
+    else if (is(items.funcStackFrameSizeInBytes))
+      m_usage.stackFrameSizeInBytes += intValue();
+    // LDS and register usage: keep the largest value seen.
+    else if (is(items.csLdsSize) || is(items.funcLdsSize))
+      m_usage.ldsSize = std::max(m_usage.ldsSize, intValue());
+    else if (is(items.csSgprCount) || is(items.funcSgprCount))
+      m_usage.sgprCount = std::max(m_usage.sgprCount, intValue());
+    else if (is(items.csVgprCount) || is(items.funcVgprCount))
+      m_usage.vgprCount = std::max(m_usage.vgprCount, intValue());
+    else if (is(items.csMemOrdered))
+      m_usage.memOrdered = boolValue();
+    // scratchEn and scratchMemorySize are only read so that the "Re-scan the new blob" check (in updateAndWrite)
+    // works.
+    else if (is(items.csScratchEn))
+      m_usage.scratchEn = boolValue();
+    else if (is(items.csScratchMemorySize))
+      m_usage.scratchMemorySize = intValue();
+    else if (is(items.shaderSubtype) && scanner.asString(item) == "Callable")
+      ++m_usage.callableShaderCount;
+    return Error::success();
+  };
+
+  if (Error err = m_msgPackScanner.scan(m_palMetadata, onItemFound))
+    report_fatal_error("Bad PAL metadata format");
+
+  LLVM_DEBUG(dbgs() << "Usage:\n" << m_usage);
+}
+
+// =====================================================================================================================
+// Construct from Module, reading the usage bytes that writeMetadata() stored as an MDString (hence cast<>).
+RegStackUsageImpl::RegStackUsageImpl(const llvm::Module &module) : m_msgPackScanner(msgPackScannerSpec) {
+  NamedMDNode *namedNode = module.getNamedMetadata(RegStackUsageMetadataName);
+  if (namedNode && namedNode->getNumOperands() >= 1) { // Otherwise m_usage stays zero-initialized
+    StringRef str = cast<MDString>(namedNode->getOperand(0)->getOperand(0))->getString();
+    assert(str.size() == sizeof(m_usage));
+    memcpy(&m_usage, str.data(), std::min(str.size(), sizeof(m_usage))); // Never read past the string
+  }
+}
+
+// =====================================================================================================================
+// Store the reg/stack usage in IR named metadata, as the raw bytes of m_usage wrapped in a single MDString operand.
+void RegStackUsageImpl::writeMetadata(Module &module) const {
+  auto &context = module.getContext();
+  StringRef usageBytes(reinterpret_cast<const char *>(&m_usage), sizeof(m_usage));
+  NamedMDNode *usageNode = module.getOrInsertNamedMetadata(RegStackUsageMetadataName);
+  usageNode->clearOperands();
+  usageNode->addOperand(MDNode::get(context, MDString::get(context, usageBytes)));
+}
+
+// =====================================================================================================================
+// Accumulate the reg/stack usage of one shader ELF ("shaderUsage") into the merged usage held by "this".
+void RegStackUsageImpl::merge(const RegStackUsageImpl &shaderUsage) {
+  assert(!m_finalized && "Cannot merge after finalizing");
+  const Usage &other = shaderUsage.m_usage;
+  // Recursion depth, backend stack size (scratch used within a func in continuations) and frontend stack size
+  // (CPS stack): take the maximum over the modules.
+  m_usage.maxRecursionDepth = std::max(m_usage.maxRecursionDepth, other.maxRecursionDepth);
+  m_usage.backendStackSize = std::max(m_usage.backendStackSize, other.backendStackSize);
+  // frontendStackSize stays unset unless at least one side has a value.
+  if (m_usage.frontendStackSize || other.frontendStackSize)
+    m_usage.frontendStackSize = std::max(m_usage.frontendStackSize.value_or(0), other.frontendStackSize.value_or(0));
+  // Other stack usage: sum over the modules.
+  m_usage.stackFrameSizeInBytes += other.stackFrameSizeInBytes;
+  // LDS size, register counts and the memOrdered flag: take the maximum over the modules.
+  m_usage.ldsSize = std::max(m_usage.ldsSize, other.ldsSize);
+  m_usage.sgprCount = std::max(m_usage.sgprCount, other.sgprCount);
+  m_usage.vgprCount = std::max(m_usage.vgprCount, other.vgprCount);
+  m_usage.memOrdered = std::max(m_usage.memOrdered, other.memOrdered);
+  // Callable shaders: count them over all modules.
+  m_usage.callableShaderCount += other.callableShaderCount;
+}
+
+// =====================================================================================================================
+// Finalize merged usage in "this" (that comes from indirect shaders), merge into the supplied ELF's usage,
+// and update the PAL metadata in the ELF.
+//
+// @param (in/out) elfBuffer : Buffer containing ELF to read and update
+// @param startOffset : Start offset of the ELF in the buffer
+// @param frontendGlobalAlignment : Alignment of frontend stack for global CPS; 0 for scratch CPS
+//
+void RegStackUsageImpl::finalizeAndUpdate(SmallVectorImpl<char> &elfBuffer, size_t startOffset,
+                                          unsigned frontendGlobalAlignment) {
+  // Scan the ELF that sits at startOffset in the buffer to get its own usage.
+  StringRef elfContents(&elfBuffer[startOffset], elfBuffer.size() - startOffset);
+  RegStackUsageImpl scannedUsage(elfContents, 0, 0);
+  // Fold that usage into the accumulated usage, then finalize the total.
+  merge(scannedUsage);
+  finalize(frontendGlobalAlignment);
+  // Write the finalized total back into the ELF, using the scanner state of the object that scanned it.
+  scannedUsage.updateAndWrite(m_usage, elfBuffer, startOffset);
+}
+
+// =====================================================================================================================
+// Finalize usage (final frontendStackSize, scratchMemorySize, scratchEn) before writing back in to the launch kernel.
+void RegStackUsageImpl::finalize(unsigned frontendGlobalAlignment) {
+  assert(!m_finalized && "Cannot finalize twice");
+#ifndef NDEBUG
+  m_finalized = true;
+#endif
+  if (m_usage.frontendStackSize) {
+    // Continuations support.
+    // Currently this uses a universal whole-pipeline frontendCallDepth and multiplies it in to frontendStackSize.
+    // The calculation could be made more sophisticated by:
+    // - taking each shader's stage into account when deciding what to multiply by;
+    // - calculating separately for each rayGen and its reachable shaders, then taking the max result.
+    // The shader stage is available in PAL metadata (already used to detect callable shaders), and the rayGen
+    // usage bitmap is passed in to RegStackUsage so it can be used this way in the future.
+    m_usage.scratchMemorySize = m_usage.backendStackSize;
+    // Get frontend call depth from the max recursion depth seen for any shader.
+    unsigned frontendCallDepth = m_usage.maxRecursionDepth;
+    // If we have any callable shaders, add on an extra 2, the arbitrary API limit for callable shaders if the
+    // app does not set its own stack depth.
+    if (m_usage.callableShaderCount != 0)
+      frontendCallDepth += 2;
+    // Add on an extra 1 to cover these cases, which all happen separately at the leaf level:
+    // - At leaf level (we are not allowed to recurse), there might still be a non-reached conditional suspend
+    //   point, and the existence of this suspend point even if not reached causes potential stack usage.
+    // - The same applies to non-reached CallShader calls even if there are no callable shaders.
+    // - Traversal and Intersection shaders also require the +1, as their usage is not reflected in the recursion
+    //   limit.
+    ++frontendCallDepth;
+
+    // Multiply frontendStackSize by the call depth.
+    m_usage.frontendStackSize = m_usage.frontendStackSize.value() * frontendCallDepth;
+    if (frontendGlobalAlignment == 0) {
+      // CPS stack ("frontend" stack) is allocated as a chunk out of scratch. We need to add its size on to
+      // scratchMemorySize.
+      m_usage.scratchMemorySize += m_usage.frontendStackSize.value();
+    } else {
+      // CPS stack ("global" stack) is allocated as global. We need to bump it to the specified alignment.
+      m_usage.frontendStackSize = alignToPowerOf2(m_usage.frontendStackSize.value(), frontendGlobalAlignment);
+    }
+  } else {
+    // Not continuations. Assume no recursion; we do not have any information on what the recursion depth could be.
+    // scratchMemorySize is the compute shader stack usage; stackFrameSizeInBytes is the sum of the stack usage of
+    // functions.
+    m_usage.scratchMemorySize += m_usage.stackFrameSizeInBytes;
+  }
+  m_usage.scratchEn = m_usage.scratchMemorySize != 0;
+
+  LLVM_DEBUG(dbgs() << "Finalized usage:\n" << m_usage);
+}
+
+// =====================================================================================================================
+// Update the ELF with supplied usage info, and rewrite the ELF. This could make the ELF a different size.
+//
+// @param (in/out) elfBuffer : Buffer containing ELF to update; must be the same ELF at the same location in
+//                             memory that was originally scanned by this RegStackUsage
+// @param startOffset : Start offset of the ELF in the buffer
+//
+void RegStackUsageImpl::updateAndWrite(const Usage &usage, SmallVectorImpl<char> &elfBuffer, size_t startOffset) {
+  if (usage.frontendStackSize) {
+    // Set backendStackSize even if 0, otherwise PAL gives the driver a junk value.
+    m_msgPackScanner.set(items.csBackendStackSize, usage.backendStackSize);
+    m_msgPackScanner.set(items.csFrontendStackSize, usage.frontendStackSize.value());
+  }
+  if (usage.scratchEn)
+    m_msgPackScanner.setBool(items.csScratchEn, usage.scratchEn);
+  if (usage.scratchMemorySize)
+    m_msgPackScanner.set(items.csScratchMemorySize, usage.scratchMemorySize);
+  if (usage.ldsSize)
+    m_msgPackScanner.set(items.csLdsSize, usage.ldsSize);
+  if (usage.sgprCount)
+    m_msgPackScanner.set(items.csSgprCount, usage.sgprCount);
+  if (usage.vgprCount)
+    m_msgPackScanner.set(items.csVgprCount, usage.vgprCount);
+  if (usage.memOrdered)
+    m_msgPackScanner.setBool(items.csMemOrdered, usage.memOrdered);
+
+  // Get MsgPackScanner to write the updated PAL metadata into a separate buffer.
+  // We cannot write it directly into elfBuffer, overwriting the original ELF, because MsgPackScanner::write
+  // reads the unmodified parts of PAL metadata from there.
+  SmallString<0> newPalMetadata;
+  raw_svector_ostream stream(newPalMetadata);
+  m_msgPackScanner.write(stream);
+
+#ifndef NDEBUG
+  // Re-scan the new blob to check that it reads back as the usage we were asked to write.
+  // Tolerate usage.scratchEn false but newUsage.m_usage.scratchEn true as LGC seems to always set it true.
+  // Tolerate backendStackSize disagreeing if frontendStackSize is 0, as we do not bother to set the former.
+  LLVM_DEBUG(dbgs() << "\nRescan the new blob\n");
+  RegStackUsageImpl newUsage(newPalMetadata);
+  assert((usage.frontendStackSize.value_or(0) == 0 || usage.backendStackSize == newUsage.m_usage.backendStackSize) &&
+         usage.frontendStackSize.value_or(0) == newUsage.m_usage.frontendStackSize.value_or(0) &&
+         usage.scratchEn <= newUsage.m_usage.scratchEn &&
+         usage.scratchMemorySize == newUsage.m_usage.scratchMemorySize && usage.ldsSize == newUsage.m_usage.ldsSize &&
+         usage.sgprCount == newUsage.m_usage.sgprCount && usage.vgprCount == newUsage.m_usage.vgprCount &&
+         usage.memOrdered == newUsage.m_usage.memOrdered);
+#endif
+
+  // Align size of both old and new PAL metadata. Pad the new PAL metadata appropriately.
+  size_t alignedOldPalMetadataSize = alignToPowerOf2(m_palMetadata.size(), m_noteAlign);
+  size_t newPalMetadataSize = newPalMetadata.size(); // Size before aligning
+  newPalMetadata.append(alignToPowerOf2(newPalMetadataSize, m_noteAlign) - newPalMetadataSize, '\0'); // Align it
+
+  // Write the new (unpadded) size into the .note record header that is just before the PAL metadata.
+  auto noteHeader = reinterpret_cast<object::Elf_Nhdr_Impl<object::ELFType<llvm::endianness::little, true>> *>(
+      &elfBuffer[startOffset + m_palMetadataNoteOffset]);
+  noteHeader->n_descsz = newPalMetadataSize;
+
+  // Resize and overwrite the PAL metadata blob in the ELF, replacing the aligned old blob with the padded new one.
+  size_t palMetadataOffset = m_palMetadata.data() - m_elfBlob.data();
+  replaceElfData(*m_elf, elfBuffer, startOffset, palMetadataOffset, alignedOldPalMetadataSize, newPalMetadata);
+  m_elf = {}; // replaceElfData does not update the ObjectFile, so drop the now-invalid object
+}
+
+// =====================================================================================================================
+// Replace some section data in an ELF.
+// Special cases of this are deleting some data (newData has 0 size) and inserting some data (oldDataSize is 0).
+// This expands or contracts the buffer as necessary, and changes the size of the section containing the change,
+// and the file offset of all sections after the change. It does not update the object::ObjectFile, which thus
+// becomes invalid.
+//
+// @param elf : ELF file object
+// @param (in/out) elfBuffer : Writable buffer containing ELF, possibly with some prefix
+// @param startOffset : Size of prefix in the buffer before we get to the ELF
+// @param dataOffset : Offset of data to remove within the ELF, and where to insert new data
+// @param oldDataSize : Size of old data to remove
+// @param newData : New data to insert in its place
+//
+void RegStackUsageImpl::replaceElfData(object::ObjectFile &elf, SmallVectorImpl<char> &elfBuffer, size_t startOffset,
+                                       size_t dataOffset, size_t oldDataSize, StringRef newData) {
+  ssize_t sizeDelta = newData.size() - oldDataSize;
+  char *elfPtr = &elfBuffer[startOffset];
+  if (sizeDelta != 0) {
+    assert((sizeDelta & 3) == 0 && "Change would upset file alignment of things after it");
+
+    // Iterate through the sections of the supplied ELF object, patching their headers in place in the buffer.
+    for (const object::SectionRef &section : elf.sections()) {
+      object::ELFSectionRef elfSection(section);
+      auto sectHeader =
+          reinterpret_cast<object::ELFType<endianness::little, true>::Shdr *>(elfSection.getRawDataRefImpl().p);
+      StringRef contents = cantFail(elfSection.getContents());
+      if (contents.begin() - elfPtr <= dataOffset && contents.end() - elfPtr > dataOffset) {
+        // This section contains the data being replaced. Change its size.
+        sectHeader->sh_size += sizeDelta;
+      } else if (contents.begin() - elfPtr > dataOffset) {
+        // This section is after the data being replaced. Change its file offset.
+        sectHeader->sh_offset += sizeDelta;
+      }
+    }
+
+    // Modify offsets in ELF header.
+    auto elfHeader = reinterpret_cast<object::ELFType<endianness::little, true>::Ehdr *>(&elfBuffer[startOffset]);
+    assert(elfHeader->e_phoff == 0 && "Executable ELF not supported");
+    if (elfHeader->e_shoff > dataOffset)
+      elfHeader->e_shoff += sizeDelta;
+
+    // Resize the ELF appropriately: grow before moving the tail up, or shrink after moving the tail down.
+    size_t oldElfSize = elfBuffer.size() - startOffset;
+    if (sizeDelta > 0) {
+      elfBuffer.resize(elfBuffer.size() + sizeDelta);
+      elfPtr = &elfBuffer[startOffset];
+    }
+    memmove(elfPtr + dataOffset + newData.size(), elfPtr + dataOffset + oldDataSize,
+            oldElfSize - (dataOffset + oldDataSize));
+    if (sizeDelta < 0) {
+      elfBuffer.resize(elfBuffer.size() + sizeDelta);
+      elfPtr = &elfBuffer[startOffset];
+    }
+  }
+
+  // Write the new data.
+  memcpy(elfPtr + dataOffset, newData.data(), newData.size());
+}
diff --git a/llpc/CMakeLists.txt b/llpc/CMakeLists.txt
index 825530be64..74935e3b07 100644
--- a/llpc/CMakeLists.txt
+++ b/llpc/CMakeLists.txt
@@ -49,31 +49,37 @@ if(ICD_BUILD_LLPC)
     add_lgc_projects()
 
     # Set other LLVM settings.
-    set(LLVM_TARGETS_TO_BUILD AMDGPU CACHE STRING Force)
-    set(LLVM_BUILD_TESTS OFF CACHE BOOL Force)
-    set(LLVM_BUILD_TOOLS ${LLPC_BUILD_LLVM_TOOLS} CACHE BOOL Force)
-    set(LLVM_BUILD_UTILS OFF CACHE BOOL Force)
-    set(LLVM_INCLUDE_DOCS OFF CACHE BOOL Force)
-    set(LLVM_INCLUDE_EXAMPLES OFF CACHE BOOL Force)
-    set(LLVM_INCLUDE_GO_TESTS OFF CACHE BOOL Force)
     set(LLVMRAYTRACING_BUILD_TESTS ${LLPC_BUILD_TESTS})
-    set(LLVM_INCLUDE_TESTS ${LLPC_BUILD_TESTS} CACHE BOOL Force)
-    set(LLVM_INCLUDE_TOOLS ON CACHE BOOL Force)
-    set(LLVM_INCLUDE_UTILS ON CACHE BOOL Force)
-    set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL Force)
-    set(LLVM_RAM_PER_TABLEGEN_JOB 10000 CACHE STRING Force)
-    set(LLVM_RAM_PER_LINK_JOB 5000 CACHE STRING Force)
-    if (NOT WIN32)
+    set(LLVM_TARGETS_TO_BUILD AMDGPU CACHE STRING "LLVM targets to build")
+    set(LLVM_BUILD_TESTS OFF CACHE BOOL "LLVM build tests")
+    set(LLVM_BUILD_TOOLS ${LLPC_BUILD_LLVM_TOOLS} CACHE BOOL "LLVM build tools")
+    set(LLVM_BUILD_UTILS OFF CACHE BOOL "LLVM build utils")
+    set(LLVM_INCLUDE_DOCS OFF CACHE BOOL "LLVM include docs")
+    set(LLVM_INCLUDE_EXAMPLES OFF CACHE BOOL "LLVM include examples")
+    set(LLVM_INCLUDE_GO_TESTS OFF CACHE BOOL "LLVM include go tests")
+    set(LLVM_INCLUDE_TESTS ${LLPC_BUILD_TESTS} CACHE BOOL "LLVM include tests")
+    set(LLVM_INCLUDE_TOOLS ON CACHE BOOL "LLVM include tools")
+    set(LLVM_INCLUDE_UTILS ON CACHE BOOL "LLVM include utils")
+    set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "LLVM enable terminfo")
+    set(LLVM_RAM_PER_TABLEGEN_JOB 4000 CACHE STRING "LLVM RAM per tablegen job")
+    set(LLVM_RAM_PER_LINK_JOB 5000 CACHE STRING "LLVM RAM per link job")
+    if(CMAKE_BUILD_TYPE_DEBUG)
         # Build optimized version of llvm-tblgen even in debug builds, for faster build times.
+        set(LLVM_OPTIMIZED_TABLEGEN ON CACHE BOOL "Build optimized llvm-tblgen")
 #if _WIN32
-        # Don't enable this on Windows, because the required "cross compile" setup doesn't work
-        # in the internal CMake setup on Windows.
+        if(LLVM_OPTIMIZED_TABLEGEN AND WIN32 AND (CMAKE_GENERATOR MATCHES "Ninja"))
+            # LLVM implements the Release build of llvm-tblgen as a cross-compile target, which fails to find
+            # our DK-based toolchain (created with amd_generate_msvc_toolchain). However, we can inject the toolchain
+            # argument into LLVM's add_custom_target that sets up this cross-compile build.
+            # See: llvm-project/llvm/cmake/modules/CrossCompile.cmake
+            set(CROSS_TOOLCHAIN_FLAGS_NATIVE "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}" CACHE STRING
+                "Toolchain flags for native build" FORCE)
+        endif()
 #endif
-        set(LLVM_OPTIMIZED_TABLEGEN ON CACHE BOOL Force)
     endif()
 
     # This will greatly speed up debug builds because we won't be listing all the symbols with llvm-nm.
-    set(LLVM_BUILD_LLVM_C_DYLIB OFF CACHE BOOL Force)
+    set(LLVM_BUILD_LLVM_C_DYLIB OFF CACHE BOOL "LLVM build LLVM-C dylib")
 
     if(EXISTS ${PROJECT_SOURCE_DIR}/../../../imported/llvm-project/llvm)
         set(XGL_LLVM_SRC_PATH ${PROJECT_SOURCE_DIR}/../../../imported/llvm-project/llvm CACHE PATH "Specify the path to the LLVM.")
@@ -142,6 +148,29 @@ endif()
 include(../cmake/CompilerFlags.cmake)
 set_compiler_options(llpcinternal ${LLPC_ENABLE_WERROR})
 
+### TableGen for LLPC dialect ########################################################################################
+if (ICD_BUILD_LLPC)
+    if (EXISTS ${LLVM_TOOLS_BINARY_PATH}/llvm-dialects-tblgen)
+      set(LLPC_TABLEGEN_EXE ${LLVM_TOOLS_BINARY_PATH}/llvm-dialects-tblgen)
+    else()
+      set(LLPC_TABLEGEN_EXE $<TARGET_FILE:llvm-dialects-tblgen>)
+    endif()
+    set(LLPC_TABLEGEN_TARGET llvm-dialects-tblgen)
+    set(LLVM_TARGET_DEFINITIONS include/LlpcDialect.td)
+
+    file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/include")
+    tablegen(LLPC include/LlpcDialect.h.inc -gen-dialect-decls --dialect llpc
+        EXTRA_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}/../imported/llvm-dialects/include
+        )
+    file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/include/context")
+    tablegen(LLPC include/context/LlpcDialect.cpp.inc -gen-dialect-defs --dialect llpc
+        EXTRA_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}/../imported/llvm-dialects/include
+        )
+
+    add_public_tablegen_target(LlpcDialectTableGen)
+    add_dependencies(llpcinternal LlpcDialectTableGen)
+endif()
+
 ### Defines/Includes/Sources ###########################################################################################
 if(ICD_BUILD_LLPC)
     list(APPEND CMAKE_MODULE_PATH
@@ -192,6 +221,7 @@ target_include_directories(llpcinternal
         translator/lib/SPIRV/libSPIRV
         util
         ../util
+        ${PROJECT_BINARY_DIR}/include
         ${XGL_PAL_PATH}/inc/core
         ${XGL_PAL_PATH}/inc/util
         ${LLVM_INCLUDE_DIRS}
@@ -207,72 +237,123 @@ if(ICD_BUILD_LLPC)
 # llpc/context
     target_sources(llpcinternal PRIVATE
         context/llpcCompiler.cpp
-        context/llpcContext.cpp
+        context/llpcCompiler.h
         context/llpcComputeContext.cpp
+        context/llpcComputeContext.h
+        context/llpcContext.cpp
+        context/llpcContext.h
+        context/llpcDialect.cpp
+        context/llpcDialect.h
         context/llpcGraphicsContext.cpp
+        context/llpcGraphicsContext.h
         context/llpcPipelineContext.cpp
+        context/llpcPipelineContext.h
         context/llpcRayTracingContext.cpp
+        context/llpcRayTracingContext.h
     )
 
 # llpc/lowering
     target_sources(llpcinternal PRIVATE
-        lowering/Lowering.cpp
         lowering/LowerAccessChain.cpp
+        lowering/LowerAccessChain.h
+        lowering/LowerAdvancedBlend.cpp
+        lowering/LowerAdvancedBlend.h
         lowering/LowerCfgMerges.cpp
+        lowering/LowerCfgMerges.h
         lowering/LowerConstImmediateStore.cpp
+        lowering/LowerConstImmediateStore.h
+        lowering/LowerCooperativeMatrix.cpp
+        lowering/LowerCooperativeMatrix.h
+        lowering/LowerGlCompatibility.cpp
+        lowering/LowerGlCompatibility.h
         lowering/LowerGlobals.cpp
+        lowering/LowerGlobals.h
         lowering/LowerInstMetaRemove.cpp
+        lowering/LowerInstMetaRemove.h
+        lowering/Lowering.cpp
+        lowering/Lowering.h
         lowering/LowerMath.cpp
+        lowering/LowerMath.h
         lowering/LowerMemoryOp.cpp
+        lowering/LowerMemoryOp.h
         lowering/LowerPostInline.cpp
+        lowering/LowerPostInline.h
         lowering/LowerRayTracing.cpp
+        lowering/LowerRayTracing.h
         lowering/LowerTerminator.cpp
+        lowering/LowerTerminator.h
         lowering/LowerTranslator.cpp
+        lowering/LowerTranslator.h
         lowering/LoweringUtil.cpp
-        lowering/ProcessGpuRtLibrary.cpp
+        lowering/LoweringUtil.h
         lowering/LowerInternalLibraryIntrinsic.cpp
-        lowering/LowerGlCompatibility.cpp
-        lowering/ScalarReplacementOfBuiltins.cpp
-        lowering/LowerCooperativeMatrix.cpp
+        lowering/LowerInternalLibraryIntrinsic.h
+        lowering/ProcessGpuRtLibrary.cpp
+        lowering/ProcessGpuRtLibrary.h
         lowering/PrepareContinuations.cpp
-        lowering/LowerAdvancedBlend.cpp
+        lowering/PrepareContinuations.h
+        lowering/ScalarReplacementOfBuiltins.cpp
+        lowering/ScalarReplacementOfBuiltins.h
         lowering/ProcessGfxRuntimeLibrary.cpp
+        lowering/ProcessGfxRuntimeLibrary.h
     )
 
 # llpc/translator
     target_sources(llpcinternal PRIVATE
         translator/lib/SPIRV/SPIRVReader.cpp
+        translator/lib/SPIRV/SPIRVReader.h
         translator/lib/SPIRV/SPIRVToLLVMDbgTran.cpp
+        translator/lib/SPIRV/SPIRVToLLVMDbgTran.h
         translator/lib/SPIRV/SPIRVUtil.cpp
     )
 
     target_sources(llpcinternal PRIVATE
         translator/lib/SPIRV/libSPIRV/SPIRVBasicBlock.cpp
+        translator/lib/SPIRV/libSPIRV/SPIRVBasicBlock.h
         translator/lib/SPIRV/libSPIRV/SPIRVDebug.cpp
+        translator/lib/SPIRV/libSPIRV/SPIRVDebug.h
         translator/lib/SPIRV/libSPIRV/SPIRVDecorate.cpp
+        translator/lib/SPIRV/libSPIRV/SPIRVDecorate.h
         translator/lib/SPIRV/libSPIRV/SPIRVEntry.cpp
+        translator/lib/SPIRV/libSPIRV/SPIRVEntry.h
         translator/lib/SPIRV/libSPIRV/SPIRVFunction.cpp
+        translator/lib/SPIRV/libSPIRV/SPIRVFunction.h
         translator/lib/SPIRV/libSPIRV/SPIRVInstruction.cpp
+        translator/lib/SPIRV/libSPIRV/SPIRVInstruction.h
         translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp
+        translator/lib/SPIRV/libSPIRV/SPIRVModule.h
         translator/lib/SPIRV/libSPIRV/SPIRVStream.cpp
+        translator/lib/SPIRV/libSPIRV/SPIRVStream.h
         translator/lib/SPIRV/libSPIRV/SPIRVType.cpp
+        translator/lib/SPIRV/libSPIRV/SPIRVType.h
         translator/lib/SPIRV/libSPIRV/SPIRVValue.cpp
+        translator/lib/SPIRV/libSPIRV/SPIRVValue.h
+        translator/lib/SPIRV/libSPIRV/SPIRVUtil.h
     )
 
 # llpc/util
     target_sources(llpcinternal PRIVATE
         util/llpcCacheAccessor.cpp
+        util/llpcCacheAccessor.h
         util/llpcDebug.cpp
+        util/llpcDebug.h
         util/llpcElfWriter.cpp
+        util/llpcElfWriter.h
         util/llpcError.cpp
+        util/llpcError.h
         util/llpcFile.cpp
+        util/llpcFile.h
         util/llpcShaderModuleHelper.cpp
+        util/llpcShaderModuleHelper.h
         util/llpcTimerProfiler.cpp
+        util/llpcTimerProfiler.h
         util/llpcUtil.cpp
+        util/llpcUtil.h
     )
 else()
     target_sources(llpcinternal PRIVATE
         util/llpcUtil.cpp
+        util/llpcUtil.h
     )
 endif()
 
@@ -354,14 +435,23 @@ if(ICD_BUILD_LLPC)
 # Add a common library for standalone compilers based on LLPC.
 add_library(llpc_standalone_compiler
     tool/llpcAutoLayout.cpp
+    tool/llpcAutoLayout.h
     tool/llpcCompilationUtils.cpp
+    tool/llpcCompilationUtils.h
     tool/llpcComputePipelineBuilder.cpp
+    tool/llpcComputePipelineBuilder.h
     tool/llpcGraphicsPipelineBuilder.cpp
+    tool/llpcGraphicsPipelineBuilder.h
     tool/llpcInputUtils.cpp
+    tool/llpcInputUtils.h
     tool/llpcPipelineBuilder.cpp
+    tool/llpcPipelineBuilder.h
     tool/llpcRayTracingPipelineBuilder.cpp
+    tool/llpcRayTracingPipelineBuilder.h
     tool/llpcShaderCache.cpp
+    tool/llpcShaderCache.h
     tool/llpcShaderCacheWrap.cpp
+    tool/llpcShaderCacheWrap.h
 )
 
 add_dependencies(llpc_standalone_compiler llpc)
diff --git a/llpc/context/llpcCompiler.cpp b/llpc/context/llpcCompiler.cpp
index fb897e9acd..fa1a0dcd0f 100644
--- a/llpc/context/llpcCompiler.cpp
+++ b/llpc/context/llpcCompiler.cpp
@@ -96,6 +96,7 @@
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/Scalar/SROA.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include <cassert>
 #include <condition_variable>
@@ -377,8 +378,11 @@ Result VKAPI_CALL ICompiler::Create(GfxIpVersion gfxIp, unsigned optionCount, co
   assert(gfxIp.major >= 10); // Only accept GFx10+
   Result result = Result::Success;
 
-  const char *client = options[0];
-  bool ignoreErrors = (strcmp(client, VkIcdName) == 0);
+  StringRef client(options[0]);
+  bool ignoreErrors = (client == VkIcdName);
+
+  // Set API name according to client
+  const char *apiName = (client == VkIcdName || client == VkCompilerName) ? "Vulkan" : "OpenGL";
 
   raw_null_ostream nullStream;
 
@@ -416,7 +420,7 @@ Result VKAPI_CALL ICompiler::Create(GfxIpVersion gfxIp, unsigned optionCount, co
 
   if (result == Result::Success) {
     SOptionHash = optionHash;
-    *ppCompiler = new Compiler(gfxIp, optionCount, options, SOptionHash, cache);
+    *ppCompiler = new Compiler(gfxIp, apiName, optionCount, options, SOptionHash, cache);
     assert(*ppCompiler);
   } else {
     *ppCompiler = nullptr;
@@ -437,13 +441,14 @@ bool VKAPI_CALL ICompiler::IsVertexFormatSupported(VkFormat format) {
 // =====================================================================================================================
 //
 // @param gfxIp : Graphics IP version info
+// @param apiName : API name from client, "Vulkan" or "OpenGL"
 // @param optionCount : Count of compilation-option strings
 // @param options : An array of compilation-option strings
 // @param optionHash : Hash code of compilation options
 // @param cache : Pointer to ICache implemented in client
-Compiler::Compiler(GfxIpVersion gfxIp, unsigned optionCount, const char *const *options, MetroHash::Hash optionHash,
-                   ICache *cache)
-    : m_gfxIp(gfxIp), m_cache(cache), m_relocatablePipelineCompilations(0) {
+Compiler::Compiler(GfxIpVersion gfxIp, const char *apiName, unsigned optionCount, const char *const *options,
+                   MetroHash::Hash optionHash, ICache *cache)
+    : m_gfxIp(gfxIp), m_apiName(apiName), m_cache(cache), m_relocatablePipelineCompilations(0) {
   for (unsigned i = 0; i < optionCount; ++i)
     m_options.push_back(options[i]);
 
@@ -638,18 +643,93 @@ Result Compiler::BuildShaderModule(const ShaderModuleBuildInfo *shaderInfo, Shad
     return Result::ErrorInvalidPointer;
   }
 
-  auto codeSizeOrErr = ShaderModuleHelper::getCodeSize(shaderInfo);
+  // Check if we can get data from cache
+  CacheAccessor cacheAccessor(hash, getInternalCaches());
+  if (cacheAccessor.isInCache()) {
+    BinaryData dataInCache = cacheAccessor.getElfFromCache();
+
+    uint8_t *allocBuf = static_cast<uint8_t *>(
+        shaderInfo->pfnOutputAlloc(shaderInfo->pInstance, shaderInfo->pUserData, dataInCache.codeSize));
+    if (!allocBuf)
+      return Result::ErrorOutOfMemory;
+
+    uint8_t *bufferWritePtr = allocBuf;
+    memcpy(bufferWritePtr, dataInCache.pCode, dataInCache.codeSize);
+
+    ShaderModuleData *shaderModuleData = reinterpret_cast<ShaderModuleData *>(bufferWritePtr);
+    bufferWritePtr += sizeof(ShaderModuleData);
+
+    shaderModuleData->binCode.pCode = bufferWritePtr;
+    bufferWritePtr += shaderModuleData->binCode.codeSize;
+
+    if (shaderInfo->options.pipelineOptions.getGlState().buildResourcesDataForShaderModule &&
+        shaderModuleData->binType == BinaryType::Spirv) {
+      ResourcesNodes *resources = reinterpret_cast<ResourcesNodes *>(bufferWritePtr);
+      shaderModuleData->usage.pResources = resources;
+      bufferWritePtr += sizeof(ResourcesNodes);
+
+      resources->pInputInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+      bufferWritePtr += shaderModuleData->usage.pResources->inputInfoCount * sizeof(ResourceNodeData);
+
+      resources->pOutputInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+      bufferWritePtr += shaderModuleData->usage.pResources->outputInfoCount * sizeof(ResourceNodeData);
+
+      resources->pUniformBufferInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+      bufferWritePtr += shaderModuleData->usage.pResources->uniformBufferInfoCount * sizeof(ResourceNodeData);
+
+      resources->pShaderStorageInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+      bufferWritePtr += shaderModuleData->usage.pResources->shaderStorageInfoCount * sizeof(ResourceNodeData);
+
+      resources->pTexturesInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+      bufferWritePtr += shaderModuleData->usage.pResources->textureInfoCount * sizeof(ResourceNodeData);
+
+      resources->pImagesInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+      bufferWritePtr += shaderModuleData->usage.pResources->imageInfoCount * sizeof(ResourceNodeData);
+
+      resources->pAtomicCounterInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+      bufferWritePtr += shaderModuleData->usage.pResources->atomicCounterInfoCount * sizeof(ResourceNodeData);
+
+      resources->pDefaultUniformInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+      bufferWritePtr += shaderModuleData->usage.pResources->defaultUniformInfoCount * sizeof(ResourceNodeData);
+    }
+
+    shaderOut->pModuleData = shaderModuleData;
+
+    if (shaderModuleData->binType == BinaryType::Spirv && cl::EnablePipelineDump) {
+      // Dump the original input binary, since the offline tool will re-run BuildShaderModule
+      PipelineDumper::DumpSpirvBinary(cl::PipelineDumpDir.c_str(), &shaderInfo->shaderBin, &hash);
+    }
+
+    return Result::Success;
+  }
+
+  auto codeSizeOrErr = ShaderModuleHelper::getShaderCodeSize(shaderInfo);
   if (Error err = codeSizeOrErr.takeError())
     return errorToResult(std::move(err));
 
   const unsigned codeSize = *codeSizeOrErr;
   size_t allocSize = sizeof(ShaderModuleData) + codeSize;
 
-  ShaderModuleData moduleData = {};
   std::vector<unsigned> codeBufferVector(codeSize / sizeof(unsigned));
   MutableArrayRef<unsigned> codeBuffer(codeBufferVector);
+
+  ShaderModuleData moduleData = {};
+  Result result = ShaderModuleHelper::getShaderBinaryType(shaderInfo->shaderBin, moduleData.binType);
+  if (result != Result::Success)
+    return result;
+
   memcpy(moduleData.hash, &hash, sizeof(hash));
-  Result result = ShaderModuleHelper::getModuleData(shaderInfo, codeBuffer, moduleData);
+
+  std::unique_ptr<SPIRVModule> module;
+  if (moduleData.binType == BinaryType::Spirv) {
+    // Parse the SPIR-V binary
+    std::string spvCode(static_cast<const char *>(shaderInfo->shaderBin.pCode), shaderInfo->shaderBin.codeSize);
+    std::istringstream spvStream(spvCode);
+    module.reset(SPIRVModule::createSPIRVModule());
+    spvStream >> *module;
+  }
+
+  result = ShaderModuleHelper::getModuleData(shaderInfo, module.get(), codeBuffer, moduleData);
 
   ResourcesNodes resourceNodes = {};
   std::vector<ResourceNodeData> inputSymbolInfo;
@@ -662,9 +742,9 @@ Result Compiler::BuildShaderModule(const ShaderModuleBuildInfo *shaderInfo, Shad
   std::vector<ResourceNodeData> defaultUniformSymbolInfo;
   if (shaderInfo->options.pipelineOptions.getGlState().buildResourcesDataForShaderModule &&
       moduleData.binType == BinaryType::Spirv) {
-    buildShaderModuleResourceUsage(shaderInfo, resourceNodes, inputSymbolInfo, outputSymbolInfo, uniformBufferInfo,
-                                   storageBufferInfo, textureSymbolInfo, imageSymbolInfo, atomicCounterSymbolInfo,
-                                   defaultUniformSymbolInfo, moduleData.usage);
+    buildShaderModuleResourceUsage(shaderInfo, module.get(), resourceNodes, inputSymbolInfo, outputSymbolInfo,
+                                   uniformBufferInfo, storageBufferInfo, textureSymbolInfo, imageSymbolInfo,
+                                   atomicCounterSymbolInfo, defaultUniformSymbolInfo, moduleData.usage);
 
     allocSize += sizeof(ResourcesNodes);
     allocSize += inputSymbolInfo.size() * sizeof(ResourceNodeData);
@@ -692,58 +772,60 @@ Result Compiler::BuildShaderModule(const ShaderModuleBuildInfo *shaderInfo, Shad
     return Result::ErrorOutOfMemory;
 
   uint8_t *bufferWritePtr = allocBuf;
-  ShaderModuleData *pShaderModuleData = nullptr;
-  ResourcesNodes *pResourcesNodes = nullptr;
-
   memcpy(bufferWritePtr, &moduleData, sizeof(moduleData));
-  pShaderModuleData = reinterpret_cast<ShaderModuleData *>(bufferWritePtr);
+
+  ShaderModuleData *shaderModuleData = reinterpret_cast<ShaderModuleData *>(bufferWritePtr);
   bufferWritePtr += sizeof(ShaderModuleData);
 
   memcpy(bufferWritePtr, codeBuffer.data(), codeBuffer.size() * sizeof(unsigned));
-  pShaderModuleData->binCode.pCode = bufferWritePtr;
+  shaderModuleData->binCode.pCode = bufferWritePtr;
   bufferWritePtr += codeBuffer.size() * sizeof(unsigned);
 
   if (shaderInfo->options.pipelineOptions.getGlState().buildResourcesDataForShaderModule &&
       moduleData.binType == BinaryType::Spirv) {
     memcpy(bufferWritePtr, &resourceNodes, sizeof(ResourcesNodes));
-    pResourcesNodes = reinterpret_cast<ResourcesNodes *>(bufferWritePtr);
-    pShaderModuleData->usage.pResources = pResourcesNodes;
+    ResourcesNodes *resources = reinterpret_cast<ResourcesNodes *>(bufferWritePtr);
+    shaderModuleData->usage.pResources = resources;
     bufferWritePtr += sizeof(ResourcesNodes);
 
     memcpy(bufferWritePtr, inputSymbolInfo.data(), inputSymbolInfo.size() * sizeof(ResourceNodeData));
-    pResourcesNodes->pInputInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+    resources->pInputInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
     bufferWritePtr += inputSymbolInfo.size() * sizeof(ResourceNodeData);
 
     memcpy(bufferWritePtr, outputSymbolInfo.data(), outputSymbolInfo.size() * sizeof(ResourceNodeData));
-    pResourcesNodes->pOutputInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+    resources->pOutputInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
     bufferWritePtr += outputSymbolInfo.size() * sizeof(ResourceNodeData);
 
     memcpy(bufferWritePtr, uniformBufferInfo.data(), uniformBufferInfo.size() * sizeof(ResourceNodeData));
-    pResourcesNodes->pUniformBufferInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+    resources->pUniformBufferInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
     bufferWritePtr += uniformBufferInfo.size() * sizeof(ResourceNodeData);
 
     memcpy(bufferWritePtr, storageBufferInfo.data(), storageBufferInfo.size() * sizeof(ResourceNodeData));
-    pResourcesNodes->pShaderStorageInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+    resources->pShaderStorageInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
     bufferWritePtr += storageBufferInfo.size() * sizeof(ResourceNodeData);
 
     memcpy(bufferWritePtr, textureSymbolInfo.data(), textureSymbolInfo.size() * sizeof(ResourceNodeData));
-    pResourcesNodes->pTexturesInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+    resources->pTexturesInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
     bufferWritePtr += textureSymbolInfo.size() * sizeof(ResourceNodeData);
 
     memcpy(bufferWritePtr, imageSymbolInfo.data(), imageSymbolInfo.size() * sizeof(ResourceNodeData));
-    pResourcesNodes->pImagesInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+    resources->pImagesInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
     bufferWritePtr += imageSymbolInfo.size() * sizeof(ResourceNodeData);
 
     memcpy(bufferWritePtr, atomicCounterSymbolInfo.data(), atomicCounterSymbolInfo.size() * sizeof(ResourceNodeData));
-    pResourcesNodes->pAtomicCounterInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+    resources->pAtomicCounterInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
     bufferWritePtr += atomicCounterSymbolInfo.size() * sizeof(ResourceNodeData);
 
     memcpy(bufferWritePtr, defaultUniformSymbolInfo.data(), defaultUniformSymbolInfo.size() * sizeof(ResourceNodeData));
-    pResourcesNodes->pDefaultUniformInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
+    resources->pDefaultUniformInfo = reinterpret_cast<ResourceNodeData *>(bufferWritePtr);
     bufferWritePtr += defaultUniformSymbolInfo.size() * sizeof(ResourceNodeData);
   }
 
-  shaderOut->pModuleData = pShaderModuleData;
+  shaderOut->pModuleData = shaderModuleData;
+
+  // Add data to cache
+  BinaryData dataToCache = {allocSize, allocBuf};
+  cacheAccessor.setElfInCache(dataToCache);
 
   if (moduleData.binType == BinaryType::Spirv && cl::EnablePipelineDump) {
     // Dump the original input binary, since the offline tool will re-run BuildShaderModule
@@ -975,10 +1057,11 @@ static void getShaderModuleUsageFromInst(SPIRVModule *module, SPIRVFunction *fun
 }
 
 // =====================================================================================================================
-// Parse the spirv binary to build the resource node data for buffers and opaque types, the resource node data will be
-// returned to client driver together with other info of ShaderModuleUsage
+// Analyze the SPIR-V module to build the resource node data for buffers and opaque types, the resource node data will
+// be returned to client driver together with other info of ShaderModuleUsage
 //
-// @param shaderInfo : Input shader info, including spirv binary
+// @param shaderInfo : Input shader info
+// @param module : SPIR-V module
 // @param [out] resourcesNodes : Output of resource usage
 // @param [out] inputSymbolInfos : Output of input symbol infos
 // @param [out] outputSymbolInfo : Output of output symbol infos
@@ -989,18 +1072,12 @@ static void getShaderModuleUsageFromInst(SPIRVModule *module, SPIRVFunction *fun
 // @param [out] atomicCounterSymbolInfo : Output of atomic counter symbol infos
 // @param [out] defaultUniformSymbolInfo : Output of default uniform symbol infos
 void Compiler::buildShaderModuleResourceUsage(
-    const ShaderModuleBuildInfo *shaderInfo, Vkgc::ResourcesNodes &resourcesNodes,
+    const ShaderModuleBuildInfo *shaderInfo, SPIRVModule *module, Vkgc::ResourcesNodes &resourcesNodes,
     std::vector<ResourceNodeData> &inputSymbolInfo, std::vector<ResourceNodeData> &outputSymbolInfo,
     std::vector<ResourceNodeData> &uniformBufferInfo, std::vector<ResourceNodeData> &storageBufferInfo,
     std::vector<ResourceNodeData> &textureSymbolInfo, std::vector<ResourceNodeData> &imageSymbolInfo,
     std::vector<ResourceNodeData> &atomicCounterSymbolInfo, std::vector<ResourceNodeData> &defaultUniformSymbolInfo,
     ShaderModuleUsage &shaderModuleUsage) {
-  // Parse the SPIR-V stream.
-  std::string spirvCode(static_cast<const char *>(shaderInfo->shaderBin.pCode), shaderInfo->shaderBin.codeSize);
-  std::istringstream spirvStream(spirvCode);
-  std::unique_ptr<SPIRVModule> module(SPIRVModule::createSPIRVModule());
-  spirvStream >> *module;
-
   ShaderStage shaderStage = shaderInfo->entryStage;
   std::set<unsigned> texelFetchImageIds;
 
@@ -1009,7 +1086,7 @@ void Compiler::buildShaderModuleResourceUsage(
   SPIRVFunction *func = nullptr;
   for (unsigned i = 0, funcCount = module->getNumFunctions(); i < funcCount; ++i) {
     func = module->getFunction(i);
-    getShaderModuleUsageFromInst(module.get(), func, texelFetchImageIds, shaderModuleUsage);
+    getShaderModuleUsageFromInst(module, func, texelFetchImageIds, shaderModuleUsage);
     entryPoint = module->getEntryPoint(func->getId());
     if (entryPoint && entryPoint->getExecModel() == convertToExecModel(shaderStage) &&
         entryPoint->getName() == shaderInfo->pEntryTarget)
@@ -1046,7 +1123,7 @@ void Compiler::buildShaderModuleResourceUsage(
     }
   }
 
-  // Spirv Reader will expand matrix to vector arrays.
+  // SPIR-V Reader will expand matrix to vector arrays.
   // Add more rsrc node here to avoid poison value in vtxFetch.
   if (shaderInfo->entryStage == ShaderStage::ShaderStageVertex) {
     size_t inputSymbolSize = inputSymbolWithArrayInfo.size();
@@ -1229,7 +1306,7 @@ Result Compiler::buildGraphicsShaderStage(const GraphicsPipelineBuildInfo *pipel
   pipelineHash = PipelineDumper::generateHashForGraphicsPipeline(pipelineInfo, false, stage);
 
   // Compile
-  GraphicsContext graphicsContext(m_gfxIp, pipelineInfo, &pipelineHash, &cacheHash);
+  GraphicsContext graphicsContext(m_gfxIp, m_apiName, pipelineInfo, &pipelineHash, &cacheHash);
   Context *context = acquireContext();
   context->attachPipelineContext(&graphicsContext);
   auto onExit = make_scope_exit([&] { releaseContext(context); });
@@ -1345,7 +1422,10 @@ Result Compiler::BuildColorExportShader(const GraphicsPipelineBuildInfo *pipelin
   hasher.Update(pipelineInfo->cbState);
   hasher.Finalize(pipelineHash.bytes);
 
-  GraphicsContext graphicsContext(m_gfxIp, pipelineInfo, &pipelineHash, &cacheHash);
+  // For color export shader, we don't use this cacheHash to cache ELF (re-calculate cache hash later), but some tools
+  // expect it to be non-zero; make a simple copy of pipelineHash here, which only affects '.internal_pipeline_hash'.
+  cacheHash = pipelineHash;
+  GraphicsContext graphicsContext(m_gfxIp, m_apiName, pipelineInfo, &pipelineHash, &cacheHash);
   Context *context = acquireContext();
   context->attachPipelineContext(&graphicsContext);
   LgcContext *builderContext = context->getLgcContext();
@@ -1490,7 +1570,7 @@ Result Compiler::buildGraphicsPipelineWithElf(const GraphicsPipelineBuildInfo *p
     if (cacheAccessor && pipelineOut->pipelineCacheAccess == CacheAccessInfo::CacheNotChecked)
       pipelineOut->pipelineCacheAccess = CacheAccessInfo::CacheMiss;
 
-    GraphicsContext graphicsContext(m_gfxIp, pipelineInfo, &pipelineHash, &cacheHash);
+    GraphicsContext graphicsContext(m_gfxIp, m_apiName, pipelineInfo, &pipelineHash, &cacheHash);
     Context *context = acquireContext();
     context->attachPipelineContext(&graphicsContext);
 
@@ -1938,6 +2018,8 @@ Result Compiler::buildPipelineInternal(Context *context, ArrayRef<const Pipeline
         result = Result::ErrorInvalidShader;
       }
 
+      context->getBuilder()->SetCurrentDebugLocation(nullptr);
+
       // Add the shader module to the list for the pipeline.
       modulesToLink.push_back(std::move(modules[shaderIndex]));
     }
@@ -2410,7 +2492,7 @@ Result Compiler::BuildGraphicsPipeline(const GraphicsPipelineBuildInfo *pipeline
 
   if (!cacheAccessor || !cacheAccessor->isInCache()) {
     LLPC_OUTS("Cache miss for graphics pipeline.\n");
-    GraphicsContext graphicsContext(m_gfxIp, pipelineInfo, &pipelineHash, &cacheHash);
+    GraphicsContext graphicsContext(m_gfxIp, m_apiName, pipelineInfo, &pipelineHash, &cacheHash);
     result = buildGraphicsPipelineInternal(&graphicsContext, shaderInfo, buildUsingRelocatableElf, &candidateElf,
                                            pipelineOut->stageCacheAccesses);
 
@@ -2539,7 +2621,7 @@ Result Compiler::BuildComputePipeline(const ComputePipelineBuildInfo *pipelineIn
   ElfPackage candidateElf;
   if (!cacheAccessor || !cacheAccessor->isInCache()) {
     LLPC_OUTS("Cache miss for compute pipeline.\n");
-    ComputeContext computeContext(m_gfxIp, pipelineInfo, &pipelineHash, &cacheHash);
+    ComputeContext computeContext(m_gfxIp, m_apiName, pipelineInfo, &pipelineHash, &cacheHash);
     result = buildComputePipelineInternal(&computeContext, pipelineInfo, buildUsingRelocatableElf, &candidateElf,
                                           &pipelineOut->stageCacheAccess);
 
@@ -2632,8 +2714,8 @@ Result Compiler::BuildRayTracingPipeline(const RayTracingPipelineBuildInfo *pipe
   if (pipelineInfo->shaderCount > 0)
     representativeShaderInfo = &pipelineInfo->pShaders[0];
 
-  RayTracingContext rayTracingContext(m_gfxIp, pipelineInfo, representativeShaderInfo, &pipelineHash, &cacheHash,
-                                      pipelineInfo->indirectStageMask);
+  RayTracingContext rayTracingContext(m_gfxIp, m_apiName, pipelineInfo, representativeShaderInfo, &pipelineHash,
+                                      &cacheHash, pipelineInfo->indirectStageMask);
   auto &summary = rayTracingContext.getRayTracingLibrarySummary();
   summary.knownSetRayFlags = ~0;
   summary.knownUnsetRayFlags = ~0;
@@ -2843,6 +2925,11 @@ Result Compiler::buildRayTracingPipelineElf(Context *context, std::unique_ptr<Mo
     std::unique_ptr<lgc::PassManager> passMgr(lgc::PassManager::Create(context->getLgcContext()));
     passMgr->registerModuleAnalysis([&] { return DialectContextAnalysis(false); });
     passMgr->addPass(LowerRaytracingPipelinePass());
+
+    // SpecializeDriverShadersPass relies on allocas introduced by LowerRaytracingPipelinePass being eliminated by SROA
+    passMgr->addPass(createModuleToFunctionPassAdaptor(SROAPass(llvm::SROAOptions::ModifyCFG)));
+    passMgr->addPass(SpecializeDriverShadersPass());
+
     bool success = runPasses(&*passMgr, module.get());
     assert(success);
     (void(success)); // unused
diff --git a/llpc/context/llpcCompiler.h b/llpc/context/llpcCompiler.h
index 591b9e2e36..03fdd9f54b 100644
--- a/llpc/context/llpcCompiler.h
+++ b/llpc/context/llpcCompiler.h
@@ -56,6 +56,12 @@ enum class PipelineLink : unsigned;
 
 } // namespace lgc
 
+namespace SPIRV {
+
+class SPIRVModule;
+
+} // namespace SPIRV
+
 namespace Llpc {
 
 using Vkgc::ElfPackage;
@@ -98,8 +104,8 @@ class GraphicsShaderCacheChecker {
 // Represents LLPC pipeline compiler.
 class Compiler : public ICompiler {
 public:
-  Compiler(GfxIpVersion gfxIp, unsigned optionCount, const char *const *options, MetroHash::Hash optionHash,
-           Vkgc::ICache *cache);
+  Compiler(GfxIpVersion gfxIp, const char *apiName, unsigned optionCount, const char *const *options,
+           MetroHash::Hash optionHash, Vkgc::ICache *cache);
   ~Compiler();
 
   virtual void VKAPI_CALL Destroy();
@@ -204,6 +210,7 @@ class Compiler : public ICompiler {
 
   std::vector<std::string> m_options;           // Compilation options
   GfxIpVersion m_gfxIp;                         // Graphics IP version info
+  const char *m_apiName;                        // API name from client, "Vulkan" or "OpenGL"
   Vkgc::ICache *m_cache;                        // Point to ICache implemented in client
   static unsigned m_instanceCount;              // The count of compiler instance
   static unsigned m_outRedirectCount;           // The count of output redirect
@@ -215,7 +222,7 @@ class Compiler : public ICompiler {
                                                                       // wait for main thread switching context
 
   void buildShaderModuleResourceUsage(
-      const ShaderModuleBuildInfo *shaderInfo, Vkgc::ResourcesNodes &resourcesNodes,
+      const ShaderModuleBuildInfo *shaderInfo, SPIRV::SPIRVModule *module, Vkgc::ResourcesNodes &resourcesNodes,
       std::vector<ResourceNodeData> &inputSymbolInfo, std::vector<ResourceNodeData> &outputSymbolInfo,
       std::vector<ResourceNodeData> &uniformBufferInfo, std::vector<ResourceNodeData> &storageBufferInfo,
       std::vector<ResourceNodeData> &textureSymbolInfo, std::vector<ResourceNodeData> &imageSymbolInfo,
diff --git a/llpc/context/llpcComputeContext.cpp b/llpc/context/llpcComputeContext.cpp
index dccd98fc41..8f5ff5e4b6 100644
--- a/llpc/context/llpcComputeContext.cpp
+++ b/llpc/context/llpcComputeContext.cpp
@@ -41,12 +41,13 @@ namespace Llpc {
 // =====================================================================================================================
 //
 // @param gfxIp : Graphics Ip version info
+// @param apiName : API name from client, "Vulkan" or "OpenGL"
 // @param pipelineInfo : Compute pipeline build info
 // @param pipelineHash : Pipeline hash code
 // @param cacheHash : Cache hash code
-ComputeContext::ComputeContext(GfxIpVersion gfxIp, const ComputePipelineBuildInfo *pipelineInfo,
+ComputeContext::ComputeContext(GfxIpVersion gfxIp, const char *apiName, const ComputePipelineBuildInfo *pipelineInfo,
                                MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash)
-    : PipelineContext(gfxIp, pipelineHash, cacheHash), m_pipelineInfo(pipelineInfo) {
+    : PipelineContext(gfxIp, apiName, pipelineHash, cacheHash), m_pipelineInfo(pipelineInfo) {
   const Vkgc::BinaryData *gpurtShaderLibrary = nullptr;
 #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 62
   gpurtShaderLibrary = &pipelineInfo->shaderLibrary;
diff --git a/llpc/context/llpcComputeContext.h b/llpc/context/llpcComputeContext.h
index f554ff9f9d..9da9ade4a6 100644
--- a/llpc/context/llpcComputeContext.h
+++ b/llpc/context/llpcComputeContext.h
@@ -38,8 +38,8 @@ namespace Llpc {
 // Represents LLPC context for compute pipeline compilation. Derived from the base class Llpc::Context.
 class ComputeContext : public PipelineContext {
 public:
-  ComputeContext(GfxIpVersion gfxIp, const ComputePipelineBuildInfo *pipelineInfo, MetroHash::Hash *pipelineHash,
-                 MetroHash::Hash *cacheHash);
+  ComputeContext(GfxIpVersion gfxIp, const char *apiName, const ComputePipelineBuildInfo *pipelineInfo,
+                 MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash);
   virtual ~ComputeContext() = default;
 
   virtual PipelineType getPipelineType() const override { return PipelineType::Compute; }
diff --git a/llpc/context/llpcContext.cpp b/llpc/context/llpcContext.cpp
index 661fb3f60d..97e73f0016 100644
--- a/llpc/context/llpcContext.cpp
+++ b/llpc/context/llpcContext.cpp
@@ -40,7 +40,9 @@
 #include "SPIRVInternal.h"
 #include "llpcCompiler.h"
 #include "llpcDebug.h"
+#include "llpcDialect.h"
 #include "llpcPipelineContext.h"
+#include "llpcRayTracingContext.h"
 #include "llpcTimerProfiler.h"
 #include "vkgcMetroHash.h"
 #include "gfxruntime/GfxRuntimeLibrary.h"
@@ -95,7 +97,7 @@ namespace Llpc {
 Context::Context(GfxIpVersion gfxIp) : LLVMContext(), m_gfxIp(gfxIp) {
   m_dialectContext =
       llvm_dialects::DialectContext::make<LgcDialect, GpurtDialect, LgcRtDialect, LgcRtqDialect, LgcCpsDialect,
-                                          LgcIlCpsDialect, continuations::ContinuationsDialect>(*this);
+                                          LgcIlCpsDialect, LlpcDialect, continuations::ContinuationsDialect>(*this);
 
   reset();
 }
@@ -222,25 +224,55 @@ void Context::setModuleTargetMachine(Module *module) {
 }
 
 // =====================================================================================================================
-// Ensure that a compatible GPURT library module is attached to this context via GpurtContext.
-void Context::ensureGpurtLibrary() {
+// Compute the GPURT key for the current pipeline context.
+GpurtKey Context::buildGpurtKey() {
   // Check whether we already have a GPURT library module that can be used
   const Vkgc::RtState *rtState = getPipelineContext()->getRayTracingState();
-  auto &gpurtContext = lgc::GpurtContext::get(*this);
   GpurtKey key = {};
+  key.rtipVersion = rtState->rtIpVersion;
   key.gpurtFeatureFlags = rtState->gpurtFeatureFlags; // gpurtFeatureFlags affect which GPURT library we're using
-  key.hwIntersectRay = rtState->bvhResDesc.dataSizeInDwords > 0;
-
-  if (gpurtContext.ownedTheModule && key != m_currentGpurtKey) {
-    gpurtContext.theModule = nullptr;
-    gpurtContext.ownedTheModule.reset();
+  key.bvhResDesc.resize(rtState->bvhResDesc.dataSizeInDwords);
+  std::copy(rtState->bvhResDesc.descriptorData,
+            rtState->bvhResDesc.descriptorData + rtState->bvhResDesc.dataSizeInDwords, key.bvhResDesc.begin());
+
+  if (getPipelineType() == PipelineType::RayTracing) {
+    auto &rtContext = *static_cast<RayTracingContext *>(getPipelineContext());
+    const auto &rtPipelineBuildInfo = *rtContext.getRayTracingPipelineBuildInfo();
+    key.rtPipeline.valid = true;
+    key.rtPipeline.cpsFlags = rtPipelineBuildInfo.cpsFlags;
+    key.rtPipeline.options.resize(rtPipelineBuildInfo.gpurtOptionCount);
+    std::copy(rtPipelineBuildInfo.pGpurtOptions,
+              rtPipelineBuildInfo.pGpurtOptions + rtPipelineBuildInfo.gpurtOptionCount, key.rtPipeline.options.begin());
+
+    // Use a stable sort so that if an option is supplied multiple times, the last occurrence is guaranteed to win.
+    llvm::stable_sort(key.rtPipeline.options, [](const Vkgc::GpurtOption &lhs, const Vkgc::GpurtOption &rhs) {
+      return lhs.nameHash < rhs.nameHash;
+    });
   }
 
-  if (gpurtContext.theModule)
-    return;
+  return key;
+}
 
-  // Create the GPURT library module
-  m_currentGpurtKey = key;
+// =====================================================================================================================
+// Ensure that a compatible GPURT library module is attached to this context via GpurtContext.
+void Context::ensureGpurtLibrary() {
+  const Vkgc::RtState *rtState = getPipelineContext()->getRayTracingState();
+  auto &gpurtContext = lgc::GpurtContext::get(*this);
+
+  {
+    GpurtKey key = buildGpurtKey();
+
+    if (gpurtContext.ownedTheModule && !m_currentGpurtKey.refines(key)) {
+      gpurtContext.theModule = nullptr;
+      gpurtContext.ownedTheModule.reset();
+    }
+
+    if (gpurtContext.theModule)
+      return;
+
+    // Create the GPURT library module
+    m_currentGpurtKey = std::move(key);
+  }
 
   ShaderModuleData moduleData = {};
   moduleData.binCode = rtState->gpurtShaderLibrary;
@@ -257,7 +289,8 @@ void Context::ensureGpurtLibrary() {
   shaderInfo.pModuleData = &moduleData;
 
   // Disable fast math contract on OpDot when there is no hardware intersectRay
-  shaderInfo.options.noContractOpDot = !key.hwIntersectRay;
+  bool hwIntersectRay = !m_currentGpurtKey.bvhResDesc.empty();
+  shaderInfo.options.noContractOpDot = !hwIntersectRay;
 
   auto gpurt = std::make_unique<Module>("_cs_", *this);
   setModuleTargetMachine(gpurt.get());
@@ -277,7 +310,7 @@ void Context::ensureGpurtLibrary() {
   }
 
   lowerPassMgr->addPass(LowerCfgMerges());
-  lowerPassMgr->addPass(ProcessGpuRtLibrary());
+  lowerPassMgr->addPass(ProcessGpuRtLibrary(m_currentGpurtKey));
   lowerPassMgr->addPass(AlwaysInlinerPass());
   lowerPassMgr->addPass(LowerAccessChain());
   lowerPassMgr->addPass(LowerGlobals());
diff --git a/llpc/context/llpcContext.h b/llpc/context/llpcContext.h
index 87ff860b3f..9b3448d14b 100644
--- a/llpc/context/llpcContext.h
+++ b/llpc/context/llpcContext.h
@@ -30,6 +30,7 @@
  */
 #pragma once
 
+#include "ProcessGpuRtLibrary.h"
 #include "llpcPipelineContext.h"
 #include "spirvExt.h"
 #include "lgc/LgcContext.h"
@@ -121,6 +122,7 @@ class Context : public llvm::LLVMContext {
   // Sets triple and data layout in specified module from the context's target machine.
   void setModuleTargetMachine(llvm::Module *module);
 
+  GpurtKey buildGpurtKey();
   void ensureGpurtLibrary();
   void ensureGfxRuntimeLibrary();
 
@@ -148,16 +150,6 @@ class Context : public llvm::LLVMContext {
 
   unsigned m_useCount = 0; // Number of times this context is used.
 
-  struct GpurtKey {
-    unsigned gpurtFeatureFlags;
-    bool hwIntersectRay;
-
-    bool operator==(const GpurtKey &other) const {
-      return gpurtFeatureFlags == other.gpurtFeatureFlags && hwIntersectRay == other.hwIntersectRay;
-    }
-    bool operator!=(const GpurtKey &other) const { return !(*this == other); }
-  };
-
   GpurtKey m_currentGpurtKey;
 };
 
diff --git a/llpc/context/llpcDialect.cpp b/llpc/context/llpcDialect.cpp
new file mode 100644
index 0000000000..9918fd9c8c
--- /dev/null
+++ b/llpc/context/llpcDialect.cpp
@@ -0,0 +1,36 @@
+/*
+ ***********************************************************************************************************************
+ *
+ *  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal in the Software without restriction, including without limitation the
+ *  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ *  sell copies of the Software, and to permit persons to whom the Software is
+ *  furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice shall be included in all
+ *  copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ *
+ **********************************************************************************************************************/
+/**
+ ***********************************************************************************************************************
+ * @file  llpcDialect.cpp
+ * @brief Implementation of the LLPC dialect definition
+ ***********************************************************************************************************************
+ */
+
+#include "llpcDialect.h"
+
+#define GET_INCLUDES
+#define GET_DIALECT_DEFS
+#include "context/LlpcDialect.cpp.inc"
diff --git a/llpc/context/llpcDialect.h b/llpc/context/llpcDialect.h
index 1bc9471c52..5ba32ee0d9 100644
--- a/llpc/context/llpcDialect.h
+++ b/llpc/context/llpcDialect.h
@@ -35,3 +35,7 @@ namespace LlpcName {
 const static char SpirvCooperativeMatrixProxy[] = "spirv.cooperative.matrix.proxy";
 
 } // namespace LlpcName
+
+#define GET_INCLUDES
+#define GET_DIALECT_DECLS
+#include "LlpcDialect.h.inc"
diff --git a/llpc/context/llpcGraphicsContext.cpp b/llpc/context/llpcGraphicsContext.cpp
index 18492fd680..c736b8014f 100644
--- a/llpc/context/llpcGraphicsContext.cpp
+++ b/llpc/context/llpcGraphicsContext.cpp
@@ -51,12 +51,13 @@ static cl::opt<bool> DisableColorExportShader("disable-color-export-shader", cl:
 // =====================================================================================================================
 //
 // @param gfxIp : Graphics Ip version info
+// @param apiName : API name from client, "Vulkan" or "OpenGL"
 // @param pipelineInfo : Graphics pipeline build info
 // @param pipelineHash : Pipeline hash code
 // @param cacheHash : Cache hash code
-GraphicsContext::GraphicsContext(GfxIpVersion gfxIp, const GraphicsPipelineBuildInfo *pipelineInfo,
+GraphicsContext::GraphicsContext(GfxIpVersion gfxIp, const char *apiName, const GraphicsPipelineBuildInfo *pipelineInfo,
                                  MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash)
-    : PipelineContext(gfxIp, pipelineHash, cacheHash), m_pipelineInfo(pipelineInfo), m_stageMask(0),
+    : PipelineContext(gfxIp, apiName, pipelineHash, cacheHash), m_pipelineInfo(pipelineInfo), m_stageMask(0),
       m_preRasterHasGs(false), m_activeStageCount(0) {
   const Vkgc::BinaryData *gpurtShaderLibrary = nullptr;
 #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 62
diff --git a/llpc/context/llpcGraphicsContext.h b/llpc/context/llpcGraphicsContext.h
index ac5515ca78..94a10d42cc 100644
--- a/llpc/context/llpcGraphicsContext.h
+++ b/llpc/context/llpcGraphicsContext.h
@@ -40,8 +40,8 @@ namespace Llpc {
 // Represents LLPC context for graphics pipeline compilation. Derived from the base class Llpc::Context.
 class GraphicsContext : public PipelineContext {
 public:
-  GraphicsContext(GfxIpVersion gfxIp, const GraphicsPipelineBuildInfo *pipelineInfo, MetroHash::Hash *pipelineHash,
-                  MetroHash::Hash *cacheHash);
+  GraphicsContext(GfxIpVersion gfxIp, const char *apiName, const GraphicsPipelineBuildInfo *pipelineInfo,
+                  MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash);
   virtual ~GraphicsContext() = default;
 
   virtual PipelineType getPipelineType() const override { return PipelineType::Graphics; }
diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp
index 6799fd4ae1..605f35772d 100644
--- a/llpc/context/llpcPipelineContext.cpp
+++ b/llpc/context/llpcPipelineContext.cpp
@@ -138,10 +138,12 @@ namespace Llpc {
 // =====================================================================================================================
 //
 // @param gfxIp : Graphics IP version info
+// @param apiName : API name from client, "Vulkan" or "OpenGL"
 // @param pipelineHash : Pipeline hash code
 // @param cacheHash : Cache hash code
-PipelineContext::PipelineContext(GfxIpVersion gfxIp, MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash)
-    : m_gfxIp(gfxIp), m_pipelineHash(*pipelineHash), m_cacheHash(*cacheHash) {
+PipelineContext::PipelineContext(GfxIpVersion gfxIp, const char *apiName, MetroHash::Hash *pipelineHash,
+                                 MetroHash::Hash *cacheHash)
+    : m_gfxIp(gfxIp), m_apiName(apiName), m_pipelineHash(*pipelineHash), m_cacheHash(*cacheHash) {
 }
 
 // =====================================================================================================================
@@ -220,7 +222,8 @@ void PipelineContext::setPipelineState(Pipeline *pipeline, Util::MetroHash64 *ha
   if (pipeline) {
     pipeline->set128BitCacheHash(get128BitCacheHashCode(),
                                  VersionTuple(LLPC_INTERFACE_MAJOR_VERSION, LLPC_INTERFACE_MINOR_VERSION));
-    pipeline->setClient("Vulkan");
+    assert(m_apiName);
+    pipeline->setClient(m_apiName);
     if (getPreRasterHasGs())
       pipeline->setPreRasterHasGs(true);
   }
@@ -296,6 +299,7 @@ Options PipelineContext::computePipelineOptions() const {
     }
   }
 
+  options.robustBufferAccess = getPipelineOptions()->robustBufferAccess;
   options.allowNullDescriptor = getPipelineOptions()->extendedRobustness.nullDescriptor;
   options.enableExtendedRobustBufferAccess = getPipelineOptions()->extendedRobustness.robustBufferAccess;
   options.disableImageResourceCheck = getPipelineOptions()->disableImageResourceCheck;
@@ -303,6 +307,7 @@ Options PipelineContext::computePipelineOptions() const {
   options.enableInterpModePatch = getPipelineOptions()->enableInterpModePatch;
   options.pageMigrationEnabled = getPipelineOptions()->pageMigrationEnabled;
   options.resourceLayoutScheme = static_cast<lgc::ResourceLayoutScheme>(getPipelineOptions()->resourceLayoutScheme);
+  options.optimizePointSizeWrite = getPipelineOptions()->optimizePointSizeWrite;
 
   // Driver report full subgroup lanes for compute shader, here we just set fullSubgroups as default options
   options.fullSubgroups = true;
diff --git a/llpc/context/llpcPipelineContext.h b/llpc/context/llpcPipelineContext.h
index 0a3d7caa4b..b30ba607ed 100644
--- a/llpc/context/llpcPipelineContext.h
+++ b/llpc/context/llpcPipelineContext.h
@@ -118,7 +118,7 @@ enum class PipelineType {
 // Represents pipeline-specific context for pipeline compilation, it is a part of LLPC context
 class PipelineContext {
 public:
-  PipelineContext(GfxIpVersion gfxIp, MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash);
+  PipelineContext(GfxIpVersion gfxIp, const char *apiName, MetroHash::Hash *pipelineHash, MetroHash::Hash *cacheHash);
   virtual ~PipelineContext() = default;
 
   // Returns the pipeline type
@@ -244,6 +244,7 @@ class PipelineContext {
   virtual lgc::Options computePipelineOptions() const;
 
   GfxIpVersion m_gfxIp;                  // Graphics IP version info
+  const char *m_apiName;                 // API name from client, "Vulkan" or "OpenGL"
   MetroHash::Hash m_pipelineHash;        // Pipeline hash code
   MetroHash::Hash m_cacheHash;           // Cache hash code
   ResourceMappingData m_resourceMapping; // Contains resource mapping nodes and static descriptor values
diff --git a/llpc/context/llpcRayTracingContext.cpp b/llpc/context/llpcRayTracingContext.cpp
index acd4d69d4e..84e639fe4b 100644
--- a/llpc/context/llpcRayTracingContext.cpp
+++ b/llpc/context/llpcRayTracingContext.cpp
@@ -40,17 +40,19 @@ using namespace llvm;
 namespace Llpc {
 // =====================================================================================================================
 //
-// @param gfxIP : Graphics Ip version info
+// @param gfxIp : Graphics Ip version info
+// @param apiName : API name from client, "Vulkan" or "OpenGL"
 // @param pipelineInfo : Ray tracing pipeline build info
 // @param traceRayShaderInfo : Trace ray shader info
 // @param pipelineHash : Pipeline hash code
 // @param cacheHash : Cache hash code
-RayTracingContext::RayTracingContext(GfxIpVersion gfxIP, const RayTracingPipelineBuildInfo *pipelineInfo,
+RayTracingContext::RayTracingContext(GfxIpVersion gfxIp, const char *apiName,
+                                     const RayTracingPipelineBuildInfo *pipelineInfo,
                                      const PipelineShaderInfo *representativeShaderInfo, MetroHash::Hash *pipelineHash,
                                      MetroHash::Hash *cacheHash, unsigned indirectStageMask)
-    : PipelineContext(gfxIP, pipelineHash, cacheHash), m_pipelineInfo(pipelineInfo), m_representativeShaderInfo(),
-      m_linked(false), m_indirectStageMask(indirectStageMask), m_entryName(""), m_callableDataMaxSize(0),
-      m_rayFlagsKnownBits(std::nullopt) {
+    : PipelineContext(gfxIp, apiName, pipelineHash, cacheHash), m_pipelineInfo(pipelineInfo),
+      m_representativeShaderInfo(), m_linked(false), m_indirectStageMask(indirectStageMask), m_entryName(""),
+      m_callableDataMaxSize(0), m_rayFlagsKnownBits(std::nullopt) {
   const Vkgc::BinaryData *gpurtShaderLibrary = nullptr;
 #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 62
   gpurtShaderLibrary = &pipelineInfo->shaderTraceRay;
diff --git a/llpc/context/llpcRayTracingContext.h b/llpc/context/llpcRayTracingContext.h
index 12b77904ca..c36f8faa54 100644
--- a/llpc/context/llpcRayTracingContext.h
+++ b/llpc/context/llpcRayTracingContext.h
@@ -46,7 +46,7 @@ namespace Llpc {
 // Represents LLPC context for ray tracing pipeline compilation. Derived from the base class Llpc::Context.
 class RayTracingContext : public PipelineContext {
 public:
-  RayTracingContext(GfxIpVersion gfxIp, const RayTracingPipelineBuildInfo *pipelineInfo,
+  RayTracingContext(GfxIpVersion gfxIp, const char *apiName, const RayTracingPipelineBuildInfo *pipelineInfo,
                     const PipelineShaderInfo *representativeShaderInfo, MetroHash::Hash *pipelineHash,
                     MetroHash::Hash *cacheHash, unsigned indirectStageMask);
   virtual ~RayTracingContext() = default;
diff --git a/llpc/docs/DdnPackShaderInputOutput.md b/llpc/docs/DdnPackShaderInputOutput.md
index 4fb52204ea..a1bd464ebe 100644
--- a/llpc/docs/DdnPackShaderInputOutput.md
+++ b/llpc/docs/DdnPackShaderInputOutput.md
@@ -81,7 +81,7 @@ layout(location = 7) in float16_t v8;
             export.generic.*.v4f32                            |
                        |                                      |
                        ----------------------------------------
-                       |       (PatchInOutImportExport pass)  |
+                       |       (LowerInOut pass)              |
         exp.f32(i32 immarg 32, i32 immarg 15)       interp.p1.f16(,,,highHalf,)
         exp.f32(i32 immarg 33, i32 immarg 15)       interp.p2.f16(,,,highHalf,)
         exp.f32(i32 immarg 34, i32 immarg 15)       interp.mov.* (ubfe for 16-bit)
@@ -106,7 +106,7 @@ union InOutLocationInfo {
   uint16_t u16All;
 };
 
-// In llpcPatchResourceCollect.h
+// In CollectResourceUsage.h
 // Represents the wrapper of input/output location info, along with handlers
 struct InOutLocation {
   uint16_t asIndex() const { return locationInfo.u16All; }
@@ -197,7 +197,7 @@ Scalarization is done before processShader() in the resource collect pass. Hence
 ### 4.5 Re-vectorization
 Fragment shader input instructions do not benefit from re-vectorization.
 Output instructions of the previous shader stage do benefit from re-vectorization, since export instructions are expensive. In general, we should strive to combine export instructions as much as possible. Furthermore, parameter exports should generally be done at the end of the hardware vertex or primitive shader stage.
-Therefore, this part of the feature changes the PatchInOutImportExport to handle exporting of generic outputs from the last geometry stage differently as follows:
+Therefore, this part of the feature changes the LowerInOut pass to handle exporting of generic outputs from the last geometry stage differently, as follows:
 1. For every generic output component, insert an alloca instruction in the function entry block.
 2. Lower all output.export.generic calls into stores to the corresponding alloca'd variable.
 3. Build export intrinsics for all alloca'd variables before the function's return statement.
diff --git a/llpc/include/LlpcDialect.td b/llpc/include/LlpcDialect.td
new file mode 100644
index 0000000000..491cac045b
--- /dev/null
+++ b/llpc/include/LlpcDialect.td
@@ -0,0 +1,53 @@
+/*
+ ***********************************************************************************************************************
+ *
+ *  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal in the Software without restriction, including without limitation the
+ *  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ *  sell copies of the Software, and to permit persons to whom the Software is
+ *  furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice shall be included in all
+ *  copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ *
+ **********************************************************************************************************************/
+
+include "llvm-dialects/Dialect/Dialect.td"
+
+def LlpcDialect : Dialect {
+  let name = "llpc";
+  let cppNamespace = "Llpc";
+}
+
+def ConstantPointer : TgConstant<(PointerType 4)>, Type;
+def PrivatePointer : TgConstant<(PointerType 5)>, Type;
+def BufferPointer : TgConstant<(PointerType 7)>, Type;
+
+class LlpcOp<string mnemonic_, list<Trait> traits_ = []>
+    : Op<LlpcDialect, mnemonic_, traits_ # [NoUnwind]>;
+
+def StructuralGepOp : LlpcOp<"structural.gep", [Memory<[]>, WillReturn]> {
+  let arguments = (ins PointerType:$base_pointer, type:$base_type, AttrI1:$inbound, varargs:$indices);
+  let results = (outs (eq $base_pointer):$result);
+
+  let summary = "GEP a pointer from a base object pointer";
+  let description = [{
+    Returns an opaque in/out pointer from a base object pointer through structural gep of indices.
+
+    'base_pointer' is the opaque pointer of base.
+    'base_type' is the data type of the in/out object that 'base_pointer' points to.
+    'inbound' is the bool attribute that marks whether the access chain is in bounds.
+    'indices' are the indices from SPIR-V.
+  }];
+}
diff --git a/llpc/include/llpc.h b/llpc/include/llpc.h
index 86eb8986de..ed481f5aea 100644
--- a/llpc/include/llpc.h
+++ b/llpc/include/llpc.h
@@ -98,7 +98,9 @@ using Vkgc::StaticDescriptorValue;
 using Vkgc::WaveBreakSize;
 
 static const unsigned MaxViewports = 16;
+
 static const char VkIcdName[] = "amdvlk";
+static const char VkCompilerName[] = "amdllpc";
 
 /// Represents per shader module options.
 struct ShaderModuleOptions {
diff --git a/llpc/lowering/LowerAccessChain.cpp b/llpc/lowering/LowerAccessChain.cpp
index 3f9756d1c3..5335ccb4ce 100644
--- a/llpc/lowering/LowerAccessChain.cpp
+++ b/llpc/lowering/LowerAccessChain.cpp
@@ -30,8 +30,12 @@
  */
 #include "LowerAccessChain.h"
 #include "SPIRVInternal.h"
+#include "llpcDialect.h"
 #include "lgc/Builder.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <stack>
@@ -57,6 +61,11 @@ PreservedAnalyses LowerAccessChain::run(Module &module, ModuleAnalysisManager &a
   // Invoke handling of "getelementptr", "load" and "store" instructions
   visit(m_module);
 
+  // Remove dead "getelementptr" and custom "gep"
+  for (auto *inst : m_removeGeps)
+    inst->eraseFromParent();
+  m_removeGeps.clear();
+
   return PreservedAnalyses::none();
 }
 
@@ -68,33 +77,34 @@ PreservedAnalyses LowerAccessChain::run(Module &module, ModuleAnalysisManager &a
 // One of the examples may be a type in which we have a multiple nested structures.
 // { { [4 x float] } }
 //
-// @param gep : Getelementptr instruction.
-void LowerAccessChain::tryToAddMissingIndicesBetweenGVandGEP(GEPOperator *gep) {
+// @param callInst : Custom structural gep instruction (must be a StructuralGepOp).
+void LowerAccessChain::tryToAddMissingIndicesBetweenGVandGEP(CallInst *callInst) {
+  auto *gep = cast<StructuralGepOp>(callInst);
 
   // We are interested only in address spaces which are used while doing global value lowering for store and load.
-  const unsigned addrSpace = gep->getType()->getPointerAddressSpace();
-  if (addrSpace != SPIRAS_Input && addrSpace != SPIRAS_Output)
-    return;
+  Value *base = gep->getBasePointer();
+  [[maybe_unused]] const unsigned addrSpace = base->getType()->getPointerAddressSpace();
+  assert(addrSpace == SPIRAS_Input || addrSpace == SPIRAS_Output);
 
-  GlobalValue *gv = dyn_cast<GlobalValue>(gep->getPointerOperand());
+  GlobalValue *gv = dyn_cast<GlobalValue>(base);
   if (!gv)
     return;
 
   // No missing indices, types are the same.
-  if (gep->getSourceElementType() == gv->getValueType())
+  Type *baseType = gep->getBaseType();
+  if (baseType == gv->getValueType())
     return;
 
   SmallVector<Value *, 8> idxs;
   idxs.push_back(m_builder->getInt32(0));
-  appendZeroIndexToMatchTypes(idxs, gep->getSourceElementType(), gv->getValueType());
+  appendZeroIndexToMatchTypes(idxs, baseType, gv->getValueType());
 
-  for (unsigned i = 2; i != gep->getNumOperands(); ++i)
-    idxs.push_back(gep->getOperand(i));
+  for (auto *idx : gep->getIndices())
+    idxs.push_back(idx);
 
-  Value *newGep = m_builder->CreateGEP(gv->getValueType(), gv, idxs);
+  Value *newGep = m_builder->create<StructuralGepOp>(gv, gv->getValueType(), gep->getInbound(), idxs);
   gep->replaceAllUsesWith(newGep);
-  if (Instruction *inst = dyn_cast<Instruction>(gep))
-    inst->eraseFromParent();
+  m_removeGeps.emplace_back(gep);
 }
 
 // =====================================================================================================================
@@ -102,7 +112,7 @@ void LowerAccessChain::tryToAddMissingIndicesBetweenGVandGEP(GEPOperator *gep) {
 //
 // @param loadInst : "Load" instruction
 void LowerAccessChain::visitLoadInst(LoadInst &loadInst) {
-  if (GEPOperator *gep = dyn_cast<GEPOperator>(loadInst.getPointerOperand())) {
+  if (auto *gep = dyn_cast<StructuralGepOp>(loadInst.getPointerOperand())) {
     m_builder->SetInsertPoint(&loadInst);
     tryToAddMissingIndicesBetweenGVandGEP(gep);
   }
@@ -113,7 +123,7 @@ void LowerAccessChain::visitLoadInst(LoadInst &loadInst) {
 //
 // @param storeInst : "Store" instruction
 void LowerAccessChain::visitStoreInst(StoreInst &storeInst) {
-  if (GEPOperator *gep = dyn_cast<GEPOperator>(storeInst.getPointerOperand())) {
+  if (auto *gep = dyn_cast<StructuralGepOp>(storeInst.getPointerOperand())) {
     m_builder->SetInsertPoint(&storeInst);
     tryToAddMissingIndicesBetweenGVandGEP(gep);
   }
@@ -127,18 +137,31 @@ void LowerAccessChain::visitGetElementPtrInst(GetElementPtrInst &getElemPtrInst)
   // NOTE: Here, we try to coalesce chained "getelementptr" instructions (created from multi-level access chain).
   // Because the metadata is always decorated on top-level pointer value (actually a global variable).
   const unsigned addrSpace = getElemPtrInst.getType()->getPointerAddressSpace();
-  if (addrSpace == SPIRAS_Private || addrSpace == SPIRAS_Input || addrSpace == SPIRAS_Output) {
-    GetElementPtrInst *gep = tryToCoalesceChain(&getElemPtrInst, addrSpace);
-    if (GEPOperator *gepOp = dyn_cast<GEPOperator>(gep)) {
-      m_builder->SetInsertPoint(gep);
-      tryToAddMissingIndicesBetweenGVandGEP(gepOp);
-    }
+  assert(addrSpace != SPIRAS_Input && addrSpace != SPIRAS_Output);
+  if (addrSpace == SPIRAS_Private) {
+    m_builder->SetInsertPoint(&getElemPtrInst);
+    tryToCoalesceChain(&getElemPtrInst);
   }
 }
 
 // =====================================================================================================================
-// Tries to coalesce chained "getelementptr" instructions (created from multi-level access chain) from bottom to top
-// in the type hierarchy.
+// Visits custom "getelementptr" instruction.
+//
+// @param callInst : Call instruction; only custom structural "gep" calls are handled
+void LowerAccessChain::visitCallInst(CallInst &callInst) {
+  auto *structuralGep = dyn_cast<StructuralGepOp>(&callInst);
+  if (!structuralGep)
+    return;
+  [[maybe_unused]] const unsigned addrSpace = structuralGep->getBasePointer()->getType()->getPointerAddressSpace();
+  assert(addrSpace == SPIRAS_Input || addrSpace == SPIRAS_Output);
+  m_builder->SetInsertPoint(&callInst);
+  auto *gep = tryToCoalesceChain(structuralGep);
+  tryToAddMissingIndicesBetweenGVandGEP(cast<StructuralGepOp>(gep));
+}
+
+// =====================================================================================================================
+// Tries to coalesce chained custom GEP or "getelementptr" instructions (created from multi-level access chain) from
+// bottom to top in the type hierarchy.
 //
 // e.g.
 //      %x = getelementptr %blockType, %blockType addrspace(N)* @block, i32 0, i32 L, i32 M
@@ -149,105 +172,112 @@ void LowerAccessChain::visitGetElementPtrInst(GetElementPtrInst &getElemPtrInst)
 //      %y = getelementptr %blockType, %blockType addrspace(N)* @block, i32 0, i32 L, i32 M, i32 N
 //
 //
-// @param getElemPtr : "getelementptr" instruction in the bottom to do coalescing
-// @param addrSpace : Address space of the pointer value of "getelementptr"
-GetElementPtrInst *LowerAccessChain::tryToCoalesceChain(GetElementPtrInst *getElemPtr, unsigned addrSpace) {
-  GetElementPtrInst *coalescedGetElemPtr = getElemPtr;
+// @param getElemPtr : "getelementptr" or custom "gep" instruction in the bottom to do coalescing
+Instruction *LowerAccessChain::tryToCoalesceChain(Instruction *getElemPtr) {
+  const bool isCustomGep = isa<StructuralGepOp>(getElemPtr);
+  auto getBasePointer = [=](Operator *gep) {
+    return isCustomGep ? cast<StructuralGepOp>(gep)->getBasePointer() : cast<GEPOperator>(gep)->getPointerOperand();
+  };
+  auto getBaseType = [=](Operator *gep) {
+    return isCustomGep ? cast<StructuralGepOp>(gep)->getBaseType() : cast<GEPOperator>(gep)->getSourceElementType();
+  };
+  auto getIndices = [=]<typename UnaryFunc>(Operator *gep, UnaryFunc &&stlRangeOp) {
+    using IterTy = decltype(std::declval<StructuralGepOp>().getIndices().begin());
+    auto range = isCustomGep ? cast<StructuralGepOp>(gep)->getIndices()
+                             : llvm::make_range(static_cast<IterTy>(cast<GEPOperator>(gep)->indices().begin()),
+                                                static_cast<IterTy>(cast<GEPOperator>(gep)->indices().end()));
+    return stlRangeOp(range);
+  };
 
-  std::stack<GEPOperator *> chainedInsts;       // Order: from top to bottom
-  std::stack<GetElementPtrInst *> removedInsts; // Order: from bottom to top
+  std::stack<Operator *> chainedInsts;    // Order: from top to bottom
+  std::stack<Instruction *> removedInsts; // Order: from bottom to top
 
-  // Collect chained "getelementptr" instructions and constants from bottom to top.
-  auto ptrVal = cast<GEPOperator>(getElemPtr);
+  // Collect chained "getelementptr" or custom "gep" instructions and constants from bottom to top.
+  auto *ptrVal = cast<Operator>(getElemPtr);
   for (;;) {
     chainedInsts.push(ptrVal);
-    ptrVal = dyn_cast<GEPOperator>(ptrVal->getOperand(0));
-    if (!ptrVal)
+    auto *basePointer = getBasePointer(ptrVal);
+    if (!isa<StructuralGepOp>(basePointer) && !isa<GEPOperator>(basePointer))
       break;
+    assert((isa<StructuralGepOp>(basePointer) && isCustomGep) || (isa<GEPOperator>(basePointer) && !isCustomGep));
+    ptrVal = cast<Operator>(basePointer);
   }
 
+  if (chainedInsts.size() <= 1)
+    return getElemPtr;
+
   // If there are more than one "getelementptr" instructions/constants, do coalescing
-  if (chainedInsts.size() > 1) {
-    SmallVector<Value *, 8> idxs;
-    unsigned startOperand = 1;
-    Value *basePtr = nullptr;
-    Type *coalescedType = nullptr;
-
-    do {
-      ptrVal = chainedInsts.top();
-      chainedInsts.pop();
-
-      if (coalescedType) {
-        Type *currentLevelGEPSourceType = ptrVal->getSourceElementType();
-        Type *oneLevelAboveGEPRetType = GetElementPtrInst::getIndexedType(coalescedType, idxs);
-        if (currentLevelGEPSourceType != oneLevelAboveGEPRetType) {
-          // For Opaque Pointers some of GEPs (all zero-index) will be removed and since Source Type of the coalesced
-          // GEP is equal to the top of chained GEPs, this will lead to accessing wrong place in memory.
-          //
-          // Example:
-          // %1 = getelementptr { i64, [3 x [4 x { <3 x i32>, <3 x i32> }]], [3 x [4 x i32]] }, ptr
-          // addrspace(5) %381, i32 0, i32 1
-          //
-          // %2 = getelementptr [3 x [4 x { <3 x i32>, <3 x i32> }]], ptr addrspace(5) %1, i32 0, i32 0
-          // ^^^ all zero-index GEP, lack of this instruction for opaque pointers
-          //
-          // %3 = getelementptr [4 x { <3 x i32>, <3 x i32> }], ptraddrspace(5) %2, i32 0, i32 0
-          // ^^^ all zero-index GEP, lack of this instruction for opaque pointers
-          //
-          // %4 = getelementptr { <3 x i32>, <3 x i32> }, ptr addrspace(5) %3, i32 0, i32 1
-          //
-          //
-          // Result after Lower Access Chain:
-          //
-          // In case of non opaque pointers
-          // %5 = getelementptr { i64, [3 x [4 x { <3 x i32>, <3 x i32> }]], [3 x [4 x i32]] }, ptr
-          // addrspace(5) %381, i32 0, i32 1, i32 0, i32 0, i32 1
-          //
-          // For opaque pointers
-          // %5 = getelementptr { i64, [3 x [4 x { <3 x i32>, <3 x i32> }]], [3 x [4 x i32]] }, ptr
-          // addrspace(5) %381, i32 0, i32 1, i32 1
-          //
-          // We need to compare two chained GEP instructions and see if return Type of one is the same as Source
-          // Type of the other. If Types are not the same than we need to add
-          // missing zero-index elements to the "idxs" which are used to create new (coalesced) GEP instruction.
-          appendZeroIndexToMatchTypes(idxs, currentLevelGEPSourceType, oneLevelAboveGEPRetType);
-        }
-      }
-
-      for (unsigned i = startOperand; i != ptrVal->getNumOperands(); ++i)
-        idxs.push_back(ptrVal->getOperand(i));
-      // NOTE: For subsequent "getelementptr" instructions/constants, we skip the first two operands. The first
-      // operand is the pointer value from which the element pointer is constructed. And the second one is always
-      // 0 to dereference the pointer value.
-      startOperand = 2;
-
-      if (!basePtr) {
-        basePtr = ptrVal->getOperand(0);
-        coalescedType = ptrVal->getSourceElementType();
-      }
-
-      if (auto inst = dyn_cast<GetElementPtrInst>(ptrVal))
-        removedInsts.push(inst);
-    } while (!chainedInsts.empty());
-
-    // Create the coalesced "getelementptr" instruction (do combining)
-    coalescedGetElemPtr = GetElementPtrInst::Create(coalescedType, basePtr, idxs, "", getElemPtr);
-    getElemPtr->replaceAllUsesWith(coalescedGetElemPtr);
-
-    // Remove dead "getelementptr" instructions where possible.
-    while (!removedInsts.empty()) {
-      GetElementPtrInst *inst = removedInsts.top();
-      if (inst->user_empty()) {
-        if (inst == getElemPtr) {
-          // We cannot remove the current instruction that InstWalker is on. Just stop it using its
-          // pointer operand, and it will be DCEd later.
-          auto &operand = inst->getOperandUse(0);
-          operand = PoisonValue::get(operand->getType());
-        } else
-          inst->eraseFromParent();
-      }
-      removedInsts.pop();
+  SmallVector<Value *, 8> indices;
+  Value *basePtr = getBasePointer(chainedInsts.top());
+  Type *coalescedType = getBaseType(chainedInsts.top());
+
+  while (!chainedInsts.empty()) {
+    ptrVal = chainedInsts.top();
+    chainedInsts.pop();
+
+    Type *currentLevelGEPSourceType = getBaseType(ptrVal);
+    Type *oneLevelAboveGEPRetType = GetElementPtrInst::getIndexedType(coalescedType, indices);
+    if (currentLevelGEPSourceType != oneLevelAboveGEPRetType) {
+      // For Opaque Pointers some of GEPs (all zero-index) will be removed and since Source Type of the coalesced
+      // GEP is equal to the top of chained GEPs, this will lead to accessing wrong place in memory.
+      //
+      // Example:
+      // %1 = getelementptr { i64, [3 x [4 x { <3 x i32>, <3 x i32> }]], [3 x [4 x i32]] }, ptr
+      // addrspace(5) %381, i32 0, i32 1
+      //
+      // %2 = getelementptr [3 x [4 x { <3 x i32>, <3 x i32> }]], ptr addrspace(5) %1, i32 0, i32 0
+      // ^^^ all zero-index GEP, lack of this instruction for opaque pointers
+      //
+      // %3 = getelementptr [4 x { <3 x i32>, <3 x i32> }], ptr addrspace(5) %2, i32 0, i32 0
+      // ^^^ all zero-index GEP, lack of this instruction for opaque pointers
+      //
+      // %4 = getelementptr { <3 x i32>, <3 x i32> }, ptr addrspace(5) %3, i32 0, i32 1
+      //
+      //
+      // Result after Lower Access Chain:
+      //
+      // In case of non opaque pointers
+      // %5 = getelementptr { i64, [3 x [4 x { <3 x i32>, <3 x i32> }]], [3 x [4 x i32]] }, ptr
+      // addrspace(5) %381, i32 0, i32 1, i32 0, i32 0, i32 1
+      //
+      // For opaque pointers
+      // %5 = getelementptr { i64, [3 x [4 x { <3 x i32>, <3 x i32> }]], [3 x [4 x i32]] }, ptr
+      // addrspace(5) %381, i32 0, i32 1, i32 1
+      //
+      // We need to compare two chained GEP instructions and see if return Type of one is the same as Source
+      // Type of the other. If the Types are not the same then we need to add the
+      // missing zero-index elements to "indices", which is used to create the new (coalesced) GEP instruction.
+      appendZeroIndexToMatchTypes(indices, currentLevelGEPSourceType, oneLevelAboveGEPRetType);
+    }
+
+    // NOTE: For subsequent "getelementptr" instructions/constants, we skip the first index because it is always
+    // 0 to dereference the pointer value.
+    const unsigned skipCount = basePtr == getBasePointer(ptrVal) ? 0 : 1;
+    for (auto *idx : getIndices(ptrVal, [=](auto range) {
+           assert(llvm::range_size(range) > 0);
+           return llvm::drop_begin(range, skipCount);
+         }))
+      indices.emplace_back(idx);
+
+    assert(isa<GetElementPtrInst>(ptrVal) || isa<StructuralGepOp>(ptrVal));
+    removedInsts.push(cast<Instruction>(ptrVal));
+  }
+
+  // Create the coalesced "getelementptr" instruction (do combining)
+  auto *coalescedGetElemPtr =
+      isCustomGep ? cast<Instruction>(m_builder->create<StructuralGepOp>(basePtr, coalescedType, false, indices))
+                  : cast<Instruction>(GetElementPtrInst::Create(coalescedType, basePtr, indices, "", getElemPtr));
+  getElemPtr->replaceAllUsesWith(coalescedGetElemPtr);
+
+  // Remove dead "getelementptr" instructions where possible.
+  while (!removedInsts.empty()) {
+    Instruction *inst = removedInsts.top();
+    if (inst->user_empty()) {
+      auto *poison = PoisonValue::get(getBasePointer(cast<Operator>(inst))->getType());
+      inst->setOperand(0, poison);
+      m_removeGeps.emplace_back(inst);
     }
+    removedInsts.pop();
   }
 
   return coalescedGetElemPtr;
diff --git a/llpc/lowering/LowerAccessChain.h b/llpc/lowering/LowerAccessChain.h
index 58fd43d875..683af15a24 100644
--- a/llpc/lowering/LowerAccessChain.h
+++ b/llpc/lowering/LowerAccessChain.h
@@ -32,7 +32,6 @@
 
 #include "Lowering.h"
 #include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/Operator.h"
 #include "llvm/IR/PassManager.h"
 
 namespace Llpc {
@@ -47,15 +46,18 @@ class LowerAccessChain : public SpirvLower,
   virtual void visitGetElementPtrInst(llvm::GetElementPtrInst &getElemPtrInst);
   virtual void visitLoadInst(llvm::LoadInst &loadInst);
   virtual void visitStoreInst(llvm::StoreInst &storeInst);
+  virtual void visitCallInst(llvm::CallInst &callInst);
 
   static llvm::StringRef name() { return "Lower SPIR-V access chain"; }
 
 private:
-  llvm::GetElementPtrInst *tryToCoalesceChain(llvm::GetElementPtrInst *getElemPtr, unsigned addrSpace);
+  llvm::Instruction *tryToCoalesceChain(llvm::Instruction *getElemPtr);
   void appendZeroIndexToMatchTypes(llvm::SmallVectorImpl<llvm::Value *> &indexOperands, llvm::Type *typeToMatch,
                                    llvm::Type *baseType);
 
-  void tryToAddMissingIndicesBetweenGVandGEP(llvm::GEPOperator *gep);
+  void tryToAddMissingIndicesBetweenGVandGEP(llvm::CallInst *callInst);
+
+  llvm::SmallVector<llvm::Instruction *, 8> m_removeGeps;
 };
 
 } // namespace Llpc
diff --git a/llpc/lowering/LowerGlCompatibility.cpp b/llpc/lowering/LowerGlCompatibility.cpp
index 179812a755..bcef28181c 100644
--- a/llpc/lowering/LowerGlCompatibility.cpp
+++ b/llpc/lowering/LowerGlCompatibility.cpp
@@ -32,9 +32,11 @@
 #include "LoweringUtil.h"
 #include "SPIRVInternal.h"
 #include "llpcContext.h"
+#include "llpcDialect.h"
 #include "llpcGraphicsContext.h"
 #include "lgc/Builder.h"
 #include "lgc/Pipeline.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/DerivedTypes.h"
 
 #define DEBUG_TYPE "lower-gl-compatibility"
@@ -346,10 +348,7 @@ void LowerGlCompatibility::collectEmulationResource() {
       for (auto md : mds) {
         if (md.IsLoc) {
           if (md.Value == Vkgc::GlCompatibilityInOutLocation::ClipVertex) {
-            if (isStructureOrArrayOfStructure)
-              m_out = &global;
-            else
-              m_clipVertex = &global;
+            m_clipVertex = &global;
           }
           if (md.Value == Vkgc::GlCompatibilityInOutLocation::FrontColor) {
             if (isStructureOrArrayOfStructure)
@@ -404,31 +403,17 @@ void LowerGlCompatibility::collectEmulationResource() {
       // Check to see if the value has been stored.
       bool beenModified = false;
       User *gep = nullptr;
-      if (auto *gepConst = dyn_cast<ConstantExpr>(user)) {
-        auto operandsCount = gepConst->getNumOperands();
-        // Skip the first indices, and the access chain target.
-        for (size_t index = 2; index < operandsCount; index++) {
-          auto *pIndex = dyn_cast<ConstantInt>(gepConst->getOperand(index));
-          if (pIndex) {
-            indexOperands.push_back(pIndex);
-          }
-        }
-        gep = gepConst;
-      } else if (auto *gepInst = dyn_cast<GetElementPtrInst>(user)) {
+      assert(!isa<ConstantExpr>(user) && !isa<GetElementPtrInst>(user));
+      if (auto *gepInst = dyn_cast<StructuralGepOp>(user)) {
         // We shouldn't have any chained GEPs here, they are coalesced by the LowerAccessChain pass.
-        for (auto index = gepInst->idx_begin(); index != gepInst->idx_end(); index++) {
-          // Skip the first indices, it should be 0 in most of time.
-          if (index == gepInst->idx_begin()) {
-            assert(cast<ConstantInt>(gepInst->idx_begin())->isZero() && "Non-zero GEP first index\n");
-            continue;
-          }
-          indexOperands.push_back(m_builder->CreateZExtOrTrunc(index->get(), m_builder->getInt32Ty()));
-        }
+        assert(cast<ConstantInt>(*gepInst->getIndices().begin())->isZero() && "Non-zero GEP first index\n");
+        for (auto *idx : llvm::drop_begin(gepInst->getIndices()))
+          indexOperands.push_back(m_builder->CreateZExtOrTrunc(idx, m_builder->getInt32Ty()));
         gep = gepInst;
       }
       if (gep != nullptr) {
         for (User *gepUser : gep->users()) {
-          assert(!isa<GetElementPtrInst>(gepUser));
+          assert(!isa<StructuralGepOp>(gepUser));
           beenModified |= isa<StoreInst>(gepUser);
         }
         decodeInOutMetaRecursivelyByIndex(glOut->getValueType(), inOutMetaConst, indexOperands, mds);
diff --git a/llpc/lowering/LowerGlobals.cpp b/llpc/lowering/LowerGlobals.cpp
index 475fae5df8..74775d06b2 100644
--- a/llpc/lowering/LowerGlobals.cpp
+++ b/llpc/lowering/LowerGlobals.cpp
@@ -33,6 +33,7 @@
 #include "SPIRVInternal.h"
 #include "llpcContext.h"
 #include "llpcDebug.h"
+#include "llpcDialect.h"
 #include "llpcGraphicsContext.h"
 #include "llpcRayTracingContext.h"
 #include "compilerutils/CompilerUtils.h"
@@ -396,11 +397,11 @@ void LowerGlobals::handleCallInst(bool checkEmitCall, bool checkInterpCall) {
 
           GlobalVariable *gv = nullptr;
           SmallVector<Value *, 6> indexOperands;
-          if (auto getElemPtr = dyn_cast<GEPOperator>(loadSrc)) {
+          if (auto getElemPtr = dyn_cast<StructuralGepOp>(loadSrc)) {
             // The interpolant is an element of the input
-            for (auto &index : getElemPtr->indices())
+            for (auto *index : getElemPtr->getIndices())
               indexOperands.push_back(m_builder->CreateZExtOrTrunc(index, m_builder->getInt32Ty()));
-            gv = cast<GlobalVariable>(getElemPtr->getPointerOperand());
+            gv = cast<GlobalVariable>(getElemPtr->getBasePointer());
           } else {
             gv = cast<GlobalVariable>(loadSrc);
           }
@@ -447,7 +448,10 @@ static bool hasPrimitiveIdx(const Constant &metaVal) {
   if (inOutMeta.IsBuiltIn) {
     unsigned builtInId = inOutMeta.Value;
     return (builtInId == spv::BuiltInPerPrimitive || builtInId == spv::BuiltInPrimitivePointIndicesEXT ||
-            builtInId == spv::BuiltInPrimitiveLineIndicesEXT || builtInId == spv::BuiltInPrimitiveTriangleIndicesEXT);
+            builtInId == spv::BuiltInPrimitiveLineIndicesEXT || builtInId == spv::BuiltInPrimitiveTriangleIndicesEXT ||
+            builtInId == spv::BuiltInPrimitiveId || builtInId == spv::BuiltInLayer || // HLSL style per-primitive data
+            builtInId == spv::BuiltInViewportIndex || builtInId == spv::BuiltInPrimitiveShadingRateKHR ||
+            builtInId == spv::BuiltInCullPrimitiveEXT);
   }
 
   return static_cast<bool>(inOutMeta.PerPrimitive);
@@ -556,6 +560,22 @@ void LowerGlobals::lowerInOut(llvm::GlobalVariable *globalVar) {
       }
     }
 
+    SmallVector<Value *, 8> indices;
+    // No longer necessary to keep the structural geps for In/Out variables, replace them with LLVM gep.
+    for (auto *user : llvm::make_early_inc_range(globalVar->users())) {
+      if (!isa<StructuralGepOp>(user))
+        continue;
+      auto *sGep = cast<StructuralGepOp>(user);
+      for (auto *idx : sGep->getIndices())
+        indices.emplace_back(idx);
+      // NOTE: Create the GEP directly rather than via the builder: the builder folds all-zero-index GEPs away
+      // (FoldGEP), which makes `replaceAllPointerUses` crash. Don't switch to the builder unless that issue is fixed.
+      auto *gep = GetElementPtrInst::Create(sGep->getBaseType(), sGep->getBasePointer(), indices, "", sGep);
+      sGep->replaceAllUsesWith(gep);
+      sGep->eraseFromParent();
+      indices.clear();
+    }
+
     SmallVector<Instruction *> toErase;
     CompilerUtils::replaceAllPointerUses(m_builder, globalVar, proxy, toErase);
     for (auto inst : toErase)
@@ -580,13 +600,11 @@ void LowerGlobals::lowerInOutUsersInPlace(llvm::GlobalVariable *globalVar, llvm:
   for (User *user : llvm::make_early_inc_range(current->users())) {
     Instruction *inst = cast<Instruction>(user);
 
-    if (auto *gep = dyn_cast<GetElementPtrInst>(inst)) {
-      // TODO: As LLVM is moving away from GEPs towards ptradds, we need a better solution, probably by adding our
-      //       own "structured GEP" operation.
-      assert(cast<ConstantInt>(gep->idx_begin()[0])->isNullValue());
+    if (auto *gep = dyn_cast<StructuralGepOp>(inst)) {
+      assert(cast<ConstantInt>(*gep->getIndices().begin())->isNullValue());
 
-      for (unsigned i = 1, e = gep->getNumIndices(); i < e; ++i)
-        indexStack.push_back(m_builder->CreateZExtOrTrunc(gep->idx_begin()[i], m_builder->getInt32Ty()));
+      for (auto *idx : llvm::drop_begin(gep->getIndices()))
+        indexStack.push_back(m_builder->CreateZExtOrTrunc(idx, m_builder->getInt32Ty()));
 
       lowerInOutUsersInPlace(globalVar, gep, indexStack);
 
@@ -609,7 +627,8 @@ void LowerGlobals::lowerInOutUsersInPlace(llvm::GlobalVariable *globalVar, llvm:
       auto indexOperands = ArrayRef(indexStack);
 
       // If the input/output is arrayed, the outermost index might be used for vertex indexing
-      if (inOutTy->isArrayTy() && (hasVertexIdx(*inOutMetaVal) || hasPrimitiveIdx(*inOutMetaVal))) {
+      if (inOutTy->isArrayTy() &&
+          (hasVertexIdx(*inOutMetaVal) || (m_shaderStage == ShaderStageMesh && hasPrimitiveIdx(*inOutMetaVal)))) {
         if (!indexOperands.empty()) {
           vertexOrPrimitiveIdx = indexOperands.front();
           indexOperands = indexOperands.drop_front();
@@ -1530,9 +1549,10 @@ void LowerGlobals::lowerBufferBlock() {
           if (inst->getFunction() != func)
             continue;
 
-          if (auto *GEP = dyn_cast<GetElementPtrInst>(inst)) {
-            for (auto *gepUser : GEP->users())
-              worklist.push_back(gepUser);
+          // Treat buffer index ops the same as GEPs: keep following their users.
+          if (isa<GetElementPtrInst, lgc::BufferIndexOp>(inst)) {
+            for (auto *user : inst->users())
+              worklist.push_back(user);
             continue;
           }
 
diff --git a/llpc/lowering/Lowering.cpp b/llpc/lowering/Lowering.cpp
index 05f0e6400a..5e19b88650 100644
--- a/llpc/lowering/Lowering.cpp
+++ b/llpc/lowering/Lowering.cpp
@@ -74,7 +74,9 @@
 #include "llvm/Transforms/IPO/InferFunctionAttrs.h"
 #include "llvm/Transforms/IPO/SCCP.h"
 #include "llvm/Transforms/InstCombine/InstCombine.h"
+#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 511856
 #include "llvm/Transforms/Instrumentation.h"
+#endif
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/ADCE.h"
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
@@ -108,7 +110,7 @@ void SpirvLower::addPasses(Context *context, ShaderStage stage, lgc::PassManager
     LgcContext::createAndAddStartStopTimer(passMgr, lowerTimer, true);
 
   if (lowerFlag.isInternalRtShader)
-    passMgr.addPass(ProcessGpuRtLibrary());
+    passMgr.addPass(ProcessGpuRtLibrary(context->buildGpurtKey()));
 
   // Lower SPIR-V CFG merges before inlining
   passMgr.addPass(LowerCfgMerges());
diff --git a/llpc/lowering/ProcessGpuRtLibrary.cpp b/llpc/lowering/ProcessGpuRtLibrary.cpp
index 48dfb81d01..38de3cab2a 100644
--- a/llpc/lowering/ProcessGpuRtLibrary.cpp
+++ b/llpc/lowering/ProcessGpuRtLibrary.cpp
@@ -53,7 +53,7 @@ using namespace llvm;
 using namespace lgc::rt;
 
 namespace Llpc {
-ProcessGpuRtLibrary::ProcessGpuRtLibrary() {
+ProcessGpuRtLibrary::ProcessGpuRtLibrary(const GpurtKey &key) : m_gpurtKey(key) {
 }
 
 // =====================================================================================================================
@@ -65,6 +65,23 @@ PreservedAnalyses ProcessGpuRtLibrary::run(Module &module, ModuleAnalysisManager
   LLVM_DEBUG(dbgs() << "Run the pass Lower-gpurt-library\n");
   SpirvLower::init(&module);
 
+  // Imbue the module with settings from the GPURT key.
+  ContHelper::setStackAddrspace(module, m_gpurtKey.rtPipeline.cpsFlags & Vkgc::CpsFlag::CpsFlagStackInGlobalMem
+                                            ? ContStackAddrspace::GlobalLLPC
+                                            : ContStackAddrspace::ScratchLLPC);
+
+  // The version is encoded as <major><minor> in decimal digits, so 11 is rtip 1.1, 20 is rtip 2.0
+  ContHelper::setRtip(module, m_gpurtKey.rtipVersion.major * 10 + m_gpurtKey.rtipVersion.minor);
+
+  SmallVector<ContSetting> contSettings;
+  for (auto &option : m_gpurtKey.rtPipeline.options) {
+    ContSetting setting;
+    setting.NameHash = option.nameHash;
+    setting.Value = option.value;
+    contSettings.push_back(setting);
+  }
+  ContHelper::setGpurtSettings(module, contSettings);
+
   // Process each function.
   SmallVector<std::pair<Function *, SmallBitVector>> argPromotionsFuncs;
   SmallVector<Function *> maybeRtFuncs;
@@ -123,6 +140,9 @@ PreservedAnalyses ProcessGpuRtLibrary::run(Module &module, ModuleAnalysisManager
   for (Function *func : maybeRtFuncs)
     processLibraryFunction(func);
 
+  // Implement builtins whose implementation is generic, i.e. not specific to LGC.
+  earlyGpurtTransform(module);
+
   return PreservedAnalyses::none();
 }
 
@@ -141,6 +161,7 @@ ProcessGpuRtLibrary::LibraryFunctionTable::LibraryFunctionTable() {
   m_libFuncPtrs["AmdTraceRayGetTriangleCompressionMode"] = &ProcessGpuRtLibrary::createGetTriangleCompressionMode;
   m_libFuncPtrs["AmdExtD3DShaderIntrinsics_LoadDwordAtAddr"] = &ProcessGpuRtLibrary::createLoadDwordAtAddr;
   m_libFuncPtrs["AmdExtD3DShaderIntrinsics_LoadDwordAtAddrx2"] = &ProcessGpuRtLibrary::createLoadDwordAtAddrx2;
+  m_libFuncPtrs["AmdExtD3DShaderIntrinsics_LoadDwordAtAddrx3"] = &ProcessGpuRtLibrary::createLoadDwordAtAddrx3;
   m_libFuncPtrs["AmdExtD3DShaderIntrinsics_LoadDwordAtAddrx4"] = &ProcessGpuRtLibrary::createLoadDwordAtAddrx4;
   m_libFuncPtrs["AmdExtD3DShaderIntrinsics_ConstantLoadDwordAtAddr"] =
       &ProcessGpuRtLibrary::createConstantLoadDwordAtAddr;
@@ -184,9 +205,8 @@ ProcessGpuRtLibrary::LibraryFunctionTable::LibraryFunctionTable() {
   m_libFuncPtrs["_AmdContStackFree"] = &ProcessGpuRtLibrary::createContStackFree;
   m_libFuncPtrs["_AmdContStackGetPtr"] = &ProcessGpuRtLibrary::createContStackGetPtr;
   m_libFuncPtrs["_AmdContStackSetPtr"] = &ProcessGpuRtLibrary::createContStackSetPtr;
-  m_libFuncPtrs["_AmdContinuationStackIsGlobal"] = &ProcessGpuRtLibrary::createContinuationStackIsGlobal;
-  m_libFuncPtrs["_AmdGetRtip"] = &ProcessGpuRtLibrary::createGetRtip;
   m_libFuncPtrs["_AmdIsLlpc"] = &ProcessGpuRtLibrary::createIsLlpc;
+  m_libFuncPtrs["_AmdGetShaderRecordIndex"] = &ProcessGpuRtLibrary::createGetShaderRecordIndex;
 }
 
 // =====================================================================================================================
@@ -209,25 +229,9 @@ void ProcessGpuRtLibrary::processLibraryFunction(Function *&func) {
     m_builder->SetInsertPoint(clearBlock(func));
     createEnqueue(func);
     return;
-  } else if (funcName.starts_with("_AmdGetUninitialized")) {
-    m_builder->SetInsertPoint(clearBlock(func));
-    Value *FrozenPoison = m_builder->CreateFreeze(PoisonValue::get(func->getReturnType()));
-    m_builder->CreateRet(FrozenPoison);
-    return;
   } else if (funcName.starts_with("_AmdRestoreSystemData")) {
     // We don't need this, leave it as dummy function so that it does nothing.
     return;
-  } else if (funcName.starts_with("_AmdGetSetting")) {
-    auto rtContext = static_cast<RayTracingContext *>(m_context->getPipelineContext());
-    SmallVector<ContSetting> contSettings;
-    for (unsigned i = 0; i < rtContext->getRayTracingPipelineBuildInfo()->gpurtOptionCount; i++) {
-      ContSetting setting;
-      setting.NameHash = rtContext->getRayTracingPipelineBuildInfo()->pGpurtOptions[i].nameHash;
-      setting.Value = rtContext->getRayTracingPipelineBuildInfo()->pGpurtOptions[i].value;
-      contSettings.push_back(setting);
-    }
-    ContHelper::handleGetSetting(*func, contSettings);
-    return;
   } else if (funcName.starts_with("_AmdValueI32Count")) {
     ContHelper::handleValueI32Count(*func, *m_builder);
     return;
@@ -241,9 +245,6 @@ void ProcessGpuRtLibrary::processLibraryFunction(Function *&func) {
     else
       ContHelper::handleValueSetI32(*newFunc, *m_builder);
     return;
-  } else if (funcName.starts_with("_AmdComplete")) {
-    ContHelper::handleComplete(*func);
-    return;
   }
 
   // Create implementation for intrinsic functions.
@@ -265,12 +266,15 @@ void ProcessGpuRtLibrary::processLibraryFunction(Function *&func) {
     return;
   }
 
-  bool isAmdAwaitLike = funcName.starts_with("_AmdAwait") || funcName.starts_with("_AmdWaitAwait");
   // NOTE: GPURT now preserves all function names started with "_Amd", but some of them are not intrinsics, e.g.,
   // "_AmdSystemData.IsTraversal", which are methods of system data structs. Skip those to let them be inlined
   // automatically.
   bool isAmdIntrinsic = funcName.starts_with("_Amd") && !funcName.contains(".");
   if (funcName.starts_with("_cont_") || isAmdIntrinsic) {
+    // TODO: Once we remove createEnqueue, also handle _AmdEnqueue* and _AmdWaitEnqueue* here.
+    bool isAmdAwaitLike =
+        isAmdIntrinsic && (funcName.starts_with("_AmdAwait") || funcName.starts_with("_AmdWaitAwait"));
+
     // This function is provided by GPURT to the compiler.
     if (!isAmdIntrinsic)
       func->setLinkage(GlobalValue::WeakAnyLinkage);
@@ -289,7 +293,7 @@ void ProcessGpuRtLibrary::processLibraryFunction(Function *&func) {
         continue;
 
       // Change the pointer type to its value type for non-struct types.
-      // Amd*Await, use value types for all arguments.
+      // Amd*Await use value types for all arguments.
       // For _cont_SetTriangleHitAttributes, we always use its value type for hitAttributes argument.
       if (!isa<StructType>(argTy.getPointerElementType()) || isAmdAwaitLike ||
           (funcName == ContDriverFunc::SetTriangleHitAttributesName && argNo == 1))
@@ -302,16 +306,22 @@ void ProcessGpuRtLibrary::processLibraryFunction(Function *&func) {
     if (isAmdIntrinsic)
       newFunc->deleteBody();
 
-    if (newFunc->getName().starts_with("_AmdWaitAwait")) {
+    // Fixup WaitAwait by removing the wait mask, and fixup [Wait]AwaitTraversal by adding a dummy return address.
+    // AwaitTraversal doesn't have a return address in HLSL because the return address is written to system data.
+    bool isWaitAwait = newFunc->getName().starts_with("_AmdWaitAwait");
+    bool isNonWaitAwait = newFunc->getName().starts_with("_AmdAwait");
+    bool isAwaitTraversal = (isWaitAwait || isNonWaitAwait) && newFunc->getName().contains("Traversal");
+    if (isWaitAwait || isAwaitTraversal) {
       llvm::forEachCall(*newFunc, [&](CallInst &CInst) {
         SmallVector<Value *> args(CInst.args());
-        // NOTE: Theoretically we should remove the wait mask so that the function signature matches
-        // _AmdAwait*(addr, returnAddr, SystemData, ...). However, _AmdWaitAwaitTraversal's arguments are defined as
-        // (addr, waitMask, SystemData, ...), thus we need to keep the waitMask as a dummy returnAddr so that
-        // LowerRaytracingPipeline can handle it correctly.
-        if (!newFunc->getName().starts_with("_AmdWaitAwaitTraversal"))
+        // Remove wait mask
+        if (isWaitAwait)
           args.erase(args.begin() + 1);
 
+        // Add dummy return address
+        if (isAwaitTraversal)
+          args.insert(args.begin() + 1, PoisonValue::get(m_builder->getInt64Ty()));
+
         m_builder->SetInsertPoint(&CInst);
         auto *newValue = m_builder->CreateNamedCall("_AmdAwait", CInst.getType(), args, {});
         CInst.replaceAllUsesWith(newValue);
@@ -447,6 +457,15 @@ void ProcessGpuRtLibrary::createLoadDwordAtAddrx2(Function *func) {
   createLoadDwordAtAddrWithType(func, int32x2Ty, SPIRAS_Global);
 }
 
+// =====================================================================================================================
+// Fill in function to global load 3 dwords at given address
+//
+// @param func : The function to process
+void ProcessGpuRtLibrary::createLoadDwordAtAddrx3(Function *func) {
+  auto int32x3Ty = FixedVectorType::get(m_builder->getInt32Ty(), 3);
+  createLoadDwordAtAddrWithType(func, int32x3Ty, SPIRAS_Global);
+}
+
 // =====================================================================================================================
 // Fill in function to global load 4 dwords at given address
 //
@@ -550,9 +569,8 @@ void ProcessGpuRtLibrary::createConvertF32toF16WithRoundingMode(Function *func,
 //
 // @param func : The function to create
 void ProcessGpuRtLibrary::createIntersectBvh(Function *func) {
-  const auto *rtState = m_context->getPipelineContext()->getRayTracingState();
-  assert(rtState->bvhResDesc.dataSizeInDwords != 0);
-  if (rtState->bvhResDesc.dataSizeInDwords < 4)
+  assert(m_gpurtKey.bvhResDesc.size() != 0);
+  if (m_gpurtKey.bvhResDesc.size() < 4)
     return;
 
 #if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 33
@@ -615,17 +633,15 @@ void ProcessGpuRtLibrary::createIntersectBvh(Function *func) {
 // @param expansion : Box expansion
 // @param boxSortMode : Box sort mode
 Value *ProcessGpuRtLibrary::createGetBvhSrd(llvm::Value *expansion, llvm::Value *boxSortMode) {
-  const auto *rtState = m_context->getPipelineContext()->getRayTracingState();
-  assert(rtState->bvhResDesc.dataSizeInDwords == 4);
+  assert(m_gpurtKey.bvhResDesc.size() == 4);
 
   // Construct image descriptor from rtstate.
   Value *bvhSrd = PoisonValue::get(FixedVectorType::get(m_builder->getInt32Ty(), 4));
-  bvhSrd =
-      m_builder->CreateInsertElement(bvhSrd, m_builder->getInt32(rtState->bvhResDesc.descriptorData[0]), uint64_t(0));
-  bvhSrd = m_builder->CreateInsertElement(bvhSrd, m_builder->getInt32(rtState->bvhResDesc.descriptorData[2]), 2u);
-  bvhSrd = m_builder->CreateInsertElement(bvhSrd, m_builder->getInt32(rtState->bvhResDesc.descriptorData[3]), 3u);
+  bvhSrd = m_builder->CreateInsertElement(bvhSrd, m_builder->getInt32(m_gpurtKey.bvhResDesc[0]), uint64_t(0));
+  bvhSrd = m_builder->CreateInsertElement(bvhSrd, m_builder->getInt32(m_gpurtKey.bvhResDesc[2]), 2u);
+  bvhSrd = m_builder->CreateInsertElement(bvhSrd, m_builder->getInt32(m_gpurtKey.bvhResDesc[3]), 3u);
 
-  Value *bvhSrdDw1 = m_builder->getInt32(rtState->bvhResDesc.descriptorData[1]);
+  Value *bvhSrdDw1 = m_builder->getInt32(m_gpurtKey.bvhResDesc[1]);
 
   if (expansion) {
     const unsigned BvhSrdBoxExpansionShift = 23;
@@ -942,6 +958,9 @@ void ProcessGpuRtLibrary::createContStackStore(llvm::Function *func) {
 // =====================================================================================================================
 // Fill in function to enqueue shader
 //
+// TODO: Once the handling of local root indices and continuation reference bit sizes has been unified, remove this
+//       method in favor of letting earlyGpurtTransform do everything.
+//
 // @param func : The function to create
 void ProcessGpuRtLibrary::createEnqueue(Function *func) {
   auto funcName = func->getName();
@@ -965,34 +984,29 @@ void ProcessGpuRtLibrary::createEnqueue(Function *func) {
   }
 
   // TODO: pass the levelMask correctly.
-  m_builder->create<cps::JumpOp>(addr, -1, PoisonValue::get(StructType::get(*m_context, {})), retAddr, tailArgs);
+  m_builder->create<cps::JumpOp>(addr, -1, PoisonValue::get(StructType::get(*m_context, {})),
+                                 PoisonValue::get(m_builder->getInt32Ty()), retAddr, tailArgs);
   m_builder->CreateUnreachable();
-}
 
-// Fill in function to check whether continuation stack is global
-//
-// @param func : The function to create
-void ProcessGpuRtLibrary::createContinuationStackIsGlobal(llvm::Function *func) {
-  m_builder->CreateRet(m_builder->create<GpurtContinuationStackIsGlobalOp>());
+  // Clear the name so that earlyGpurtTransform doesn't try to handle the function.
+  func->setName({});
 }
 
 // =====================================================================================================================
-// Fill in function to get RTIP
+// Fill in function to tell GPURT it is compiled from LLPC
 //
 // @param func : The function to create
-void ProcessGpuRtLibrary::createGetRtip(llvm::Function *func) {
-  auto rtip = m_context->getPipelineContext()->getRayTracingState()->rtIpVersion;
-  // The version is encoded as <major><minor> in decimal digits, so 11 is rtip 1.1, 20 is rtip 2.0
-  m_builder->CreateRet(m_builder->getInt32(rtip.major * 10 + rtip.minor));
+void ProcessGpuRtLibrary::createIsLlpc(llvm::Function *func) {
+  auto *trueConst = ConstantInt::getTrue(func->getContext());
+  m_builder->CreateRet(trueConst);
 }
 
 // =====================================================================================================================
-// Fill in function to tell GPURT it is compiled from LLPC
+// Fill in function to get the current function's shader record index
 //
 // @param func : The function to create
-void ProcessGpuRtLibrary::createIsLlpc(llvm::Function *func) {
-  auto *trueConst = ConstantInt::getTrue(func->getContext());
-  m_builder->CreateRet(trueConst);
+void ProcessGpuRtLibrary::createGetShaderRecordIndex(llvm::Function *func) {
+  m_builder->CreateRet(m_builder->create<lgc::rt::ShaderIndexOp>());
 }
 
 // =====================================================================================================================
diff --git a/llpc/lowering/ProcessGpuRtLibrary.h b/llpc/lowering/ProcessGpuRtLibrary.h
index 6f4ceb1aa6..23c7e527e5 100644
--- a/llpc/lowering/ProcessGpuRtLibrary.h
+++ b/llpc/lowering/ProcessGpuRtLibrary.h
@@ -35,12 +35,53 @@
 #include "llvm/IR/PassManager.h"
 
 namespace Llpc {
+
+// =====================================================================================================================
+// Key that fully determines the cached GPURT library module.
+//
+// Code run during the one-time specialization of the GPURT library module must only depend on fields in this structure.
+// In particular, it must not depend directly on any fields from the pipeline context -- such fields must be passed
+// through the GpurtKey structure so that we can reliably test whether a cached GPURT module can be reused.
+struct GpurtKey {
+  Vkgc::RtIpVersion rtipVersion;
+  unsigned gpurtFeatureFlags;
+  llvm::SmallVector<uint32_t, 4> bvhResDesc;
+
+  struct {
+    bool valid;
+    uint32_t cpsFlags;
+    std::vector<Vkgc::GpurtOption> options; // sorted by nameHash
+  } rtPipeline;
+
+  // Returns true if this key is equal to or (strictly) _refines_ the other key. A key with RT pipeline settings
+  // can refine a key without them if all the general settings (outside of rtPipeline) are equal.
+  bool refines(const GpurtKey &other) const {
+    if (!rtPipeline.valid && other.rtPipeline.valid)
+      return false;
+    if (rtPipeline.valid && other.rtPipeline.valid) {
+      if (rtPipeline.cpsFlags != other.rtPipeline.cpsFlags)
+        return false;
+      if (!llvm::equal(rtPipeline.options, other.rtPipeline.options,
+                       [](const Vkgc::GpurtOption &lhs, const Vkgc::GpurtOption &rhs) {
+                         return lhs.nameHash == rhs.nameHash && lhs.value == rhs.value;
+                       }))
+        return false;
+    }
+    return rtipVersion == other.rtipVersion && gpurtFeatureFlags == other.gpurtFeatureFlags &&
+           llvm::equal(bvhResDesc, other.bvhResDesc);
+  }
+};
+
 class ProcessGpuRtLibrary : public SpirvLower, public llvm::PassInfoMixin<ProcessGpuRtLibrary> {
 public:
-  ProcessGpuRtLibrary();
+  ProcessGpuRtLibrary(const GpurtKey &key);
   llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager);
 
 private:
+  // The key holding all the information necessary for specializing the GPURT module. No other state may be used to
+  // affect the specialization, in particular no state from the pipeline context.
+  const GpurtKey m_gpurtKey;
+
   typedef void (ProcessGpuRtLibrary::*LibraryFuncPtr)(llvm::Function *);
   struct LibraryFunctionTable {
     llvm::DenseMap<llvm::StringRef, LibraryFuncPtr> m_libFuncPtrs;
@@ -63,6 +104,7 @@ class ProcessGpuRtLibrary : public SpirvLower, public llvm::PassInfoMixin<Proces
   void createGetTriangleCompressionMode(llvm::Function *func);
   void createLoadDwordAtAddr(llvm::Function *func);
   void createLoadDwordAtAddrx2(llvm::Function *func);
+  void createLoadDwordAtAddrx3(llvm::Function *func);
   void createLoadDwordAtAddrx4(llvm::Function *func);
   void createConstantLoadDwordAtAddr(llvm::Function *func);
   void createConstantLoadDwordAtAddrx2(llvm::Function *func);
@@ -99,9 +141,8 @@ class ProcessGpuRtLibrary : public SpirvLower, public llvm::PassInfoMixin<Proces
   void createContStackStore(llvm::Function *func);
   void createFloatOpWithRoundMode(llvm::Function *func);
   void createEnqueue(llvm::Function *func);
-  void createContinuationStackIsGlobal(llvm::Function *func);
-  void createGetRtip(llvm::Function *func);
   void createIsLlpc(llvm::Function *func);
+  void createGetShaderRecordIndex(llvm::Function *func);
   void createShaderMarker(llvm::Function *func);
   void createWaveScan(llvm::Function *func);
   llvm::Value *createGetBvhSrd(llvm::Value *expansion, llvm::Value *boxSortMode);
diff --git a/llpc/lowering/ScalarReplacementOfBuiltins.cpp b/llpc/lowering/ScalarReplacementOfBuiltins.cpp
index deb8b0ab0f..3f6f0678bc 100644
--- a/llpc/lowering/ScalarReplacementOfBuiltins.cpp
+++ b/llpc/lowering/ScalarReplacementOfBuiltins.cpp
@@ -31,9 +31,11 @@
 #include "ScalarReplacementOfBuiltins.h"
 #include "SPIRVInternal.h"
 #include "llpcContext.h"
+#include "llpcDialect.h"
 #include "vkgcDefs.h"
 #include "spirv/spirv.hpp"
 #include "lgc/Builder.h"
+#include "llvm/ADT/ADL.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/IR/Analysis.h"
@@ -77,7 +79,6 @@ PreservedAnalyses ScalarReplacementOfBuiltins::run(Module &module, ModuleAnalysi
     if (!needsSplit(global))
       continue;
 
-    // TODO: Handle the case where globalBuiltinVar is gl_in or gl_MeshVerticesEXT.
     if (global->getValueType()->isStructTy()) {
       splitBuiltinStructure(global);
       changed = true;
@@ -114,7 +115,7 @@ ShaderInOutMetadata ScalarReplacementOfBuiltins::getShaderInOutMetadata(Type *el
 // @param globalBuiltinVar : Global variable containing built-in type
 bool ScalarReplacementOfBuiltins::needsSplit(GlobalVariable *globalBuiltinVar) {
   auto addressSpace = globalBuiltinVar->getType()->getAddressSpace();
-  if (addressSpace != SPIRV::SPIRAS_Output)
+  if (addressSpace != SPIRV::SPIRAS_Output && addressSpace != SPIRV::SPIRAS_Input)
     return false;
 
   Type *valueType = globalBuiltinVar->getValueType();
@@ -275,55 +276,34 @@ void ScalarReplacementOfBuiltins::replaceGlobalBuiltinVar(GlobalVariable *global
     } else if (LoadInst *loadInst = dyn_cast<LoadInst>(user)) {
       GlobalVariable *LoadValue = cast<GlobalVariable>(elements[0]);
       loadInst->replaceUsesOfWith(globalBuiltinVar, LoadValue);
-    } else if (auto *gepInst = dyn_cast<GetElementPtrInst>(user)) {
+    } else if (auto *gepInst = dyn_cast<StructuralGepOp>(user)) {
       SmallVector<Value *, 8> indices;
-      GlobalVariable *globalValueReplace = nullptr;
-      Type *globalValueReplaceTy = nullptr;
-      unsigned index = UINT_MAX;
-
-      if (globalBuiltinVar->getValueType()->isStructTy()) {
-        // Note: The newly generated global variables are created based on the elements of the original global structure
-        // variable. Therefore, when encountering a GetElementPtr (GEP) instruction, we utilize the second operand to
-        // determine which of the newly generated global variables corresponds to a specific element in the original
-        // structure.
-        // Example:
-        // GEP Instruction: getelementptr ({ <4 x float>, float... }, ptr addrspace(65) @0, i32 0, i32 4)
-        // Here, `gepInst->idx_begin() + 1` retrieves the index to access the fourth element of the
-        // original structure (0-indexed), which corresponds to the fourth newly created global variable.
-        // This allows matching the GEP indices with the corresponding split global variables.
-        index = cast<ConstantInt>(gepInst->idx_begin() + 1)->getZExtValue();
-        indices.push_back(*(gepInst->idx_begin()));
-        unsigned int numIndices = gepInst->getNumIndices();
-        if (numIndices >= 3)
-          indices.append(gepInst->idx_begin() + 2, gepInst->idx_end());
-        assert(cast<ConstantInt>(indices[0])->isZero() && "Non-zero GEP first index\n");
-      } else if (globalBuiltinVar->getValueType()->isArrayTy()) {
-        // Note: The newly generated global variables are derived from the elements of the original array.
-        // When processing a GetElementPtr (GEP) instruction that navigates through such an array, the third operand
-        // (after the base pointer and the initial index which is typically zero) indicates the specific element
-        // in the array that is being accessed.
-        // Example:
-        // GEP Instruction: getelementptr [3 x { <4 x float>, ... }], ptr addrspace(65) @gl_out, i32 0, i32 %5, i32 4
-        // In this example, `gepInst->idx_begin() + 2` corresponds to `i32 4`, which is used to access the fourth
-        // element of the array (0-indexed). This element index is used to determine the appropriate newly created
-        // global variable that corresponds to this element in the original array structure. This indexing helps in
-        // directly mapping the GEP instruction indices to the split global variables.
-        index = cast<ConstantInt>(gepInst->idx_begin() + 2)->getZExtValue();
-        for (auto it = gepInst->idx_begin(); it != gepInst->idx_end(); ++it) {
-          if (it - gepInst->idx_begin() == 2)
-            continue;
-          indices.push_back(*it);
-        }
-      } else {
-        llvm_unreachable("Not implemented");
-      }
-
-      globalValueReplace = cast<GlobalVariable>(elements[index]);
-      globalValueReplaceTy = globalValueReplace->getValueType();
+      // NOTE: The newly generated global variables are created based on the elements of the original global structure
+      // variable or global array variable. Therefore, when encountering a structural GEP, we use the element index
+      // (position 1 for a structure, 2 for an array) to determine which of the newly generated global variables
+      // corresponds to a specific element in the original type.
+      // For example:
+      //   structure built-in: getelementptr { <4 x float>, float, ... }, ptr addrspace(65) @0, i32 0, i32 1
+      //   array built-in: getelementptr [3 x { <4 x float>, ... }], ptr addrspace(65) @1, i32 0, i32 %5, i32 0, i32 2
+      //  ===>
+      //   scalarized structure built-in: getelementptr float, ptr addrspace(65) @gl_out_0, i32 0
+      //   scalarized array built-in: getelementptr [3 x <4 x float>], ptr addrspace(65) @gl_out_1, i32 0, i32 %5, i32 2
+      //
+      // The first index is always 0 and dereferences the pointer value. The element index (position 1 if the original
+      // global variable is a structure, or 2 if it is an array) indicates which built-in variable is used.
+      assert(globalBuiltinVar->getValueType()->isStructTy() || globalBuiltinVar->getValueType()->isArrayTy());
+      const auto indexRange = gepInst->getIndices();
+      const auto elementIdxIt =
+          std::next(llvm::adl_begin(indexRange), globalBuiltinVar->getValueType()->isStructTy() ? 1 : 2);
+      indices.append(llvm::adl_begin(indexRange), elementIdxIt);
+      // Remove the element index from the indices.
+      const unsigned index = cast<ConstantInt>(*elementIdxIt)->getZExtValue();
+      const auto indicesAfterElementIdx = llvm::make_range(std::next(elementIdxIt), llvm::adl_end(indexRange));
+      indices.append(llvm::adl_begin(indicesAfterElementIdx), llvm::adl_end(indicesAfterElementIdx));
+      assert(cast<ConstantInt>(indices[0])->isZero() && "Non-zero GEP first index\n");
+      Type *globalValueReplaceTy = cast<GlobalVariable>(elements[index])->getValueType();
       m_builder->SetInsertPoint(gepInst);
-      Value *gepElement =
-          m_builder->CreateGEP(globalValueReplaceTy, elements[index], indices, "",
-                               gepInst->isInBounds() ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none());
+      Value *gepElement = m_builder->create<StructuralGepOp>(elements[index], globalValueReplaceTy, false, indices);
       gepInst->replaceAllUsesWith(gepElement);
       gepInst->eraseFromParent();
     } else {
@@ -358,7 +338,7 @@ void ScalarReplacementOfBuiltins::splitBuiltinStructure(GlobalVariable *globalBu
     StringRef builtinElementName = getBuiltinElementName(inOutMeta);
     GlobalVariable *replacementBuiltinVar = new GlobalVariable(
         *m_module, elementType, false, GlobalValue::ExternalLinkage, nullptr, prefixName + builtinElementName, nullptr,
-        GlobalVariable::NotThreadLocal, SPIRV::SPIRAS_Output);
+        GlobalVariable::NotThreadLocal, globalBuiltinVar->getType()->getAddressSpace());
 
     replacementBuiltinVar->addMetadata(gSPIRVMD::InOut,
                                        *MDNode::get(*m_context, {ConstantAsMetadata::get(elementMetadata)}));
diff --git a/llpc/test/shaderdb/core/FMA_TestOperandIsZero.spvasm b/llpc/test/shaderdb/core/FMA_TestOperandIsZero.spvasm
index 3d20b207b5..0e61f43a2c 100644
--- a/llpc/test/shaderdb/core/FMA_TestOperandIsZero.spvasm
+++ b/llpc/test/shaderdb/core/FMA_TestOperandIsZero.spvasm
@@ -1,7 +1,7 @@
 ; Test on fma((b==0.0 ? 0.0 : a), (a==0.0 ? 0.0 : b), c)
 
 ; BEGIN_SHADERTEST
-; RUN: amdllpc --disable-fma=false -stop-after=lgc-patch-mul-dx9-zero %gfxip -o - %s | FileCheck -check-prefix=SHADERTEST %s
+; RUN: amdllpc --disable-fma=false -stop-after=lgc-lower-mul-dx9-zero %gfxip -o - %s | FileCheck -check-prefix=SHADERTEST %s
 ; SHADERTEST-LABEL: @lgc.shader.FS.main(
 ; SHADERTEST: call nnan float @llvm.amdgcn.fma.legacy(float {{.*}}, float {{.*}}, float {{.*}})
 ; SHADERTEST: call nnan float @llvm.amdgcn.fma.legacy(float {{.*}}, float {{.*}}, float {{.*}})
diff --git a/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm b/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm
index 44f5f4b79f..486e539c07 100644
--- a/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm
+++ b/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm
@@ -100,32 +100,33 @@
 ; SHADERTEST-NEXT:    [[TMP8:%.*]] = insertvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP5]], ptr addrspace(4) [[TMP6]], 3
 ; SHADERTEST-NEXT:    [[TMP9:%.*]] = insertvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP8]], i32 [[TMP7]], 4
 ; SHADERTEST-NEXT:    [[_11:%.*]] = insertvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP9]], i32 0, 5
-; SHADERTEST-NEXT:    [[TMP10:%.*]] = load i64, ptr addrspace(64) @_4, align 8
-; SHADERTEST-NEXT:    store i64 [[TMP10]], ptr addrspace(5) [[_12]], align 8
-; SHADERTEST-NEXT:    [[TMP11:%.*]] = load i64, ptr addrspace(5) [[_12]], align 8
-; SHADERTEST-NEXT:    call void @spirv.NonUniform.i64(i64 [[TMP11]])
-; SHADERTEST-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
-; SHADERTEST-NEXT:    [[TMP13:%.*]] = getelementptr [4294967295 x i8], ptr null, i32 0, i32 [[TMP12]]
-; SHADERTEST-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i32
-; SHADERTEST-NEXT:    [[TMP15:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[_11]], 0
-; SHADERTEST-NEXT:    [[TMP16:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[_11]], 1
-; SHADERTEST-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP14]], [[TMP16]]
-; SHADERTEST-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP15]], i32 [[TMP17]]
-; SHADERTEST-NEXT:    [[TMP19:%.*]] = insertvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[_11]], ptr addrspace(4) [[TMP18]], 0
-; SHADERTEST-NEXT:    [[TMP20:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP19]], 3
-; SHADERTEST-NEXT:    [[TMP21:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP19]], 4
-; SHADERTEST-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP14]], [[TMP21]]
-; SHADERTEST-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP20]], i32 [[TMP22]]
-; SHADERTEST-NEXT:    [[TMP24:%.*]] = insertvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP19]], ptr addrspace(4) [[TMP23]], 3
-; SHADERTEST-NEXT:    call void @"spirv.NonUniform.s[p4,i32,i32,p4,i32,i32]"({ ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP24]])
-; SHADERTEST-NEXT:    call void @"spirv.NonUniform.s[p4,i32,i32,p4,i32,i32]"({ ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP24]])
-; SHADERTEST-NEXT:    store { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP24]], ptr addrspace(5) [[TMP0]], align 8
-; SHADERTEST-NEXT:    [[TMP25:%.*]] = load { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 }, ptr addrspace(5) [[TMP0]], align 8
-; SHADERTEST-NEXT:    [[TMP26:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP25]], 0
-; SHADERTEST-NEXT:    [[TMP27:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP25]], 2
-; SHADERTEST-NEXT:    [[TMP28:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP25]], 3
-; SHADERTEST-NEXT:    [[TMP29:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP25]], 5
-; SHADERTEST-NEXT:    [[TMP30:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) [[TMP26]], ptr addrspace(4) [[TMP28]], i32 1, <2 x float> zeroinitializer)
-; SHADERTEST-NEXT:    store <4 x float> [[TMP30]], ptr addrspace(65) @_3, align 16
+; SHADERTEST-NEXT:    [[TMP10:%.*]] = call ptr addrspace(64) (...) @llpc.structural.gep__p64(ptr addrspace(64) @_4, <2 x i64> poison, i1 false, i32 0, i32 0)
+; SHADERTEST-NEXT:    [[TMP11:%.*]] = load i64, ptr addrspace(64) [[TMP10]], align 8
+; SHADERTEST-NEXT:    store i64 [[TMP11]], ptr addrspace(5) [[_12]], align 8
+; SHADERTEST-NEXT:    [[TMP12:%.*]] = load i64, ptr addrspace(5) [[_12]], align 8
+; SHADERTEST-NEXT:    call void @spirv.NonUniform.i64(i64 [[TMP12]])
+; SHADERTEST-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32
+; SHADERTEST-NEXT:    [[TMP14:%.*]] = getelementptr [4294967295 x i8], ptr null, i32 0, i32 [[TMP13]]
+; SHADERTEST-NEXT:    [[TMP15:%.*]] = ptrtoint ptr [[TMP14]] to i32
+; SHADERTEST-NEXT:    [[TMP16:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[_11]], 0
+; SHADERTEST-NEXT:    [[TMP17:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[_11]], 1
+; SHADERTEST-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP15]], [[TMP17]]
+; SHADERTEST-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP16]], i32 [[TMP18]]
+; SHADERTEST-NEXT:    [[TMP20:%.*]] = insertvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[_11]], ptr addrspace(4) [[TMP19]], 0
+; SHADERTEST-NEXT:    [[TMP21:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP20]], 3
+; SHADERTEST-NEXT:    [[TMP22:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP20]], 4
+; SHADERTEST-NEXT:    [[TMP23:%.*]] = mul i32 [[TMP15]], [[TMP22]]
+; SHADERTEST-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP21]], i32 [[TMP23]]
+; SHADERTEST-NEXT:    [[TMP25:%.*]] = insertvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP20]], ptr addrspace(4) [[TMP24]], 3
+; SHADERTEST-NEXT:    call void @"spirv.NonUniform.s[p4,i32,i32,p4,i32,i32]"({ ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP25]])
+; SHADERTEST-NEXT:    call void @"spirv.NonUniform.s[p4,i32,i32,p4,i32,i32]"({ ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP25]])
+; SHADERTEST-NEXT:    store { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP25]], ptr addrspace(5) [[TMP0]], align 8
+; SHADERTEST-NEXT:    [[TMP26:%.*]] = load { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 }, ptr addrspace(5) [[TMP0]], align 8
+; SHADERTEST-NEXT:    [[TMP27:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP26]], 0
+; SHADERTEST-NEXT:    [[TMP28:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP26]], 2
+; SHADERTEST-NEXT:    [[TMP29:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP26]], 3
+; SHADERTEST-NEXT:    [[TMP30:%.*]] = extractvalue { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 } [[TMP26]], 5
+; SHADERTEST-NEXT:    [[TMP31:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) [[TMP27]], ptr addrspace(4) [[TMP29]], i32 1, <2 x float> zeroinitializer)
+; SHADERTEST-NEXT:    store <4 x float> [[TMP31]], ptr addrspace(65) @_3, align 16
 ; SHADERTEST-NEXT:    ret void
 ;
diff --git a/llpc/test/shaderdb/core/OpAtomicAnd_TestInt64ImageAtomicAnd.spvasm b/llpc/test/shaderdb/core/OpAtomicAnd_TestInt64ImageAtomicAnd.spvasm
index 172de1d234..498d36264f 100644
--- a/llpc/test/shaderdb/core/OpAtomicAnd_TestInt64ImageAtomicAnd.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicAnd_TestInt64ImageAtomicAnd.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.i64(i32 8
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.and.2d.i64.i16(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.and.2d.i64.i16{{(\.v8i32)?}}(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpAtomicCompareExchange_TestInt64ImageAtomicCompSwap.spvasm b/llpc/test/shaderdb/core/OpAtomicCompareExchange_TestInt64ImageAtomicCompSwap.spvasm
index 2c231c01cb..2b58dd9a5e 100644
--- a/llpc/test/shaderdb/core/OpAtomicCompareExchange_TestInt64ImageAtomicCompSwap.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicCompareExchange_TestInt64ImageAtomicCompSwap.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.compare.swap.i64(i32 1
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.cmpswap.2d.i64.i16(i64 %{{.*}}, i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.cmpswap.2d.i64.i16{{(\.v8i32)?}}(i64 %{{.*}}, i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpAtomicExchange_TestInt64ImageAtomicExchange.spvasm b/llpc/test/shaderdb/core/OpAtomicExchange_TestInt64ImageAtomicExchange.spvasm
index 82a61ae3fd..b62118d9f4 100644
--- a/llpc/test/shaderdb/core/OpAtomicExchange_TestInt64ImageAtomicExchange.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicExchange_TestInt64ImageAtomicExchange.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.i64(i32 0
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.swap.2d.i64.i16(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.swap.2d.i64.i16{{(\.v8i32)?}}(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpAtomicIAdd_TestInt64ImageAtomicAdd.spvasm b/llpc/test/shaderdb/core/OpAtomicIAdd_TestInt64ImageAtomicAdd.spvasm
index bd58428a80..57142da842 100644
--- a/llpc/test/shaderdb/core/OpAtomicIAdd_TestInt64ImageAtomicAdd.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicIAdd_TestInt64ImageAtomicAdd.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.i64(i32 2
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.add.2d.i64.i16(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.add.2d.i64.i16{{(\.v8i32)?}}(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpAtomicIDecrement_TestInt64ImageAtomicDecrement.spvasm b/llpc/test/shaderdb/core/OpAtomicIDecrement_TestInt64ImageAtomicDecrement.spvasm
index d1d6eae661..9f3c8b02c1 100644
--- a/llpc/test/shaderdb/core/OpAtomicIDecrement_TestInt64ImageAtomicDecrement.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicIDecrement_TestInt64ImageAtomicDecrement.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.i64(i32 3, {{.*}}, i64 1)
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.sub.2d.i64.i16(i64 1, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.sub.2d.i64.i16{{(\.v8i32)?}}(i64 1, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpAtomicIIncrement_TestInt64ImageAtomicIncrement.spvasm b/llpc/test/shaderdb/core/OpAtomicIIncrement_TestInt64ImageAtomicIncrement.spvasm
index 7a0965192a..473bea4d2d 100644
--- a/llpc/test/shaderdb/core/OpAtomicIIncrement_TestInt64ImageAtomicIncrement.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicIIncrement_TestInt64ImageAtomicIncrement.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.i64(i32 2, {{.*}}, i64 1)
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.add.2d.i64.i16(i64 1, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.add.2d.i64.i16{{(\.v8i32)?}}(i64 1, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpAtomicISub_TestInt64ImageAtomicSub.spvasm b/llpc/test/shaderdb/core/OpAtomicISub_TestInt64ImageAtomicSub.spvasm
index 7982e64b7c..e535c33cad 100644
--- a/llpc/test/shaderdb/core/OpAtomicISub_TestInt64ImageAtomicSub.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicISub_TestInt64ImageAtomicSub.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.i64(i32 3
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.sub.2d.i64.i16(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.sub.2d.i64.i16{{(\.v8i32)?}}(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpAtomicLoad_TestInt64ImageAtomicLoad.spvasm b/llpc/test/shaderdb/core/OpAtomicLoad_TestInt64ImageAtomicLoad.spvasm
index b742314a10..677c6052e4 100644
--- a/llpc/test/shaderdb/core/OpAtomicLoad_TestInt64ImageAtomicLoad.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicLoad_TestInt64ImageAtomicLoad.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.i64(i32 2, {{.*}}, i64 0)
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.add.2d.i64.i16(i64 0, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.add.2d.i64.i16{{(\.v8i32)?}}(i64 0, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpAtomicOr_TestInt64ImageAtomicOr.spvasm b/llpc/test/shaderdb/core/OpAtomicOr_TestInt64ImageAtomicOr.spvasm
index 5a005af17e..039c4c760e 100644
--- a/llpc/test/shaderdb/core/OpAtomicOr_TestInt64ImageAtomicOr.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicOr_TestInt64ImageAtomicOr.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.i64(i32 9
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.or.2d.i64.i16(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.or.2d.i64.i16{{(\.v8i32)?}}(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpAtomicSMax_TestInt64ImageAtomicMax.spvasm b/llpc/test/shaderdb/core/OpAtomicSMax_TestInt64ImageAtomicMax.spvasm
index 39278a9233..3ed4320662 100644
--- a/llpc/test/shaderdb/core/OpAtomicSMax_TestInt64ImageAtomicMax.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicSMax_TestInt64ImageAtomicMax.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.i64(i32 6
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.smax.2d.i64.i16(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.smax.2d.i64.i16{{(\.v8i32)?}}(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpAtomicSMin_TestInt64ImageAtomicMin.spvasm b/llpc/test/shaderdb/core/OpAtomicSMin_TestInt64ImageAtomicMin.spvasm
index de4f499c04..a78d57f0c7 100644
--- a/llpc/test/shaderdb/core/OpAtomicSMin_TestInt64ImageAtomicMin.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicSMin_TestInt64ImageAtomicMin.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.i64(i32 4
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.smin.2d.i64.i16(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.smin.2d.i64.i16{{(\.v8i32)?}}(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpAtomicStore_TestInt64ImageAtomicStore.spvasm b/llpc/test/shaderdb/core/OpAtomicStore_TestInt64ImageAtomicStore.spvasm
index 9e1962d9b9..52eff183e7 100644
--- a/llpc/test/shaderdb/core/OpAtomicStore_TestInt64ImageAtomicStore.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicStore_TestInt64ImageAtomicStore.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.i64(i32 0
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.swap.2d.i64.i16(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.swap.2d.i64.i16{{(\.v8i32)?}}(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpAtomicUMax_TestInt64ImageAtomicMax.spvasm b/llpc/test/shaderdb/core/OpAtomicUMax_TestInt64ImageAtomicMax.spvasm
index a9ea8e775e..550da81a81 100644
--- a/llpc/test/shaderdb/core/OpAtomicUMax_TestInt64ImageAtomicMax.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicUMax_TestInt64ImageAtomicMax.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.i64(i32 7
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.umax.2d.i64.i16(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.umax.2d.i64.i16{{(\.v8i32)?}}(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpAtomicUMin_TestInt64ImageAtomicMin.spvasm b/llpc/test/shaderdb/core/OpAtomicUMin_TestInt64ImageAtomicMin.spvasm
index d5d3749bbf..01196491f3 100644
--- a/llpc/test/shaderdb/core/OpAtomicUMin_TestInt64ImageAtomicMin.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicUMin_TestInt64ImageAtomicMin.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.i64(i32 5
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.umin.2d.i64.i16(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.umin.2d.i64.i16{{(\.v8i32)?}}(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestImageDimension_lit.comp b/llpc/test/shaderdb/core/OpAtomicXXX_TestImageDimension_lit.comp
index f68b4bb460..138b3ccfe5 100644
--- a/llpc/test/shaderdb/core/OpAtomicXXX_TestImageDimension_lit.comp
+++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestImageDimension_lit.comp
@@ -214,94 +214,94 @@ void main()
 ; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 7, i32 512, i32 0, ptr addrspace(4)
 
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 9, <4 x i32> %{{[0-9]*}}, i32 7, i32 0, i32 0
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i16(i32 9, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i16(i32 9, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i16(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.3d.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.cube.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.3d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.cube.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: call i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32(i32 9, <4 x i32> %{{[0-9]*}}, i32 7, i32 0, i32 0
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.1darray.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.2darray.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.cube.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.2dmsaa.i32.i16(i32 9, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.2darraymsaa.i32.i16(i32 9, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i16(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.3d.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.cube.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.1darray.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.2darray.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.cube.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.2dmsaa.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.2darraymsaa.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.3d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.cube.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: call i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32(i32 9, <4 x i32> %{{[0-9]*}}, i32 7, i32 0, i32 0
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.1darray.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.2darray.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.cube.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.2dmsaa.i32.i16(i32 9, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.2darraymsaa.i32.i16(i32 9, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i16(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.3d.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.cube.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.1darray.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.2darray.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.cube.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.2dmsaa.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.2darraymsaa.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.3d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.cube.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: call i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32 9, <4 x i32> %{{[0-9]*}}, i32 7, i32 0, i32 0
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.1darray.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2darray.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.cube.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2dmsaa.i32.i16(i32 9, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2darraymsaa.i32.i16(i32 9, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i16(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.3d.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.cube.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.1darray.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2darray.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.cube.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2dmsaa.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2darraymsaa.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.3d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.cube.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: call i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32 9, <4 x i32> %{{[0-9]*}}, i32 7, i32 0, i32 0
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.1darray.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.2darray.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.cube.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.2dmsaa.i32.i16(i32 9, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.2darraymsaa.i32.i16(i32 9, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i16(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.3d.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.cube.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.1darray.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.2darray.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.cube.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.2dmsaa.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.2darraymsaa.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.3d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.cube.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 9, <4 x i32> %{{[0-9]*}}, i32 7, i32 0, i32 0
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.1darray.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2darray.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.cube.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2dmsaa.i32.i16(i32 9, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2darraymsaa.i32.i16(i32 9, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i16(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.3d.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.cube.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.1darray.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2darray.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.cube.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2dmsaa.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2darraymsaa.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.3d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.cube.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 9, <4 x i32> %{{[0-9]*}}, i32 7, i32 0, i32 0
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.1darray.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2darray.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.cube.i32.i16(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2dmsaa.i32.i16(i32 9, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2darraymsaa.i32.i16(i32 9, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i16(i32 9, i32 3, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2d.i32.i16(i32 9, i32 3, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.3d.i32.i16(i32 9, i32 3, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2d.i32.i16(i32 9, i32 3, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.cube.i32.i16(i32 9, i32 3, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.1darray.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2darray.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.cube.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2dmsaa.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2darraymsaa.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i16{{(\.v8i32)?}}(i32 9, i32 3, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2d.i32.i16{{(\.v8i32)?}}(i32 9, i32 3, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.3d.i32.i16{{(\.v8i32)?}}(i32 9, i32 3, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2d.i32.i16{{(\.v8i32)?}}(i32 9, i32 3, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.cube.i32.i16{{(\.v8i32)?}}(i32 9, i32 3, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 9, i32 3, <4 x i32> %{{[0-9]*}}, i32 7, i32 0, i32 0
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.1darray.i32.i16(i32 9, i32 3, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2darray.i32.i16(i32 9, i32 3, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.cube.i32.i16(i32 9, i32 3, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2dmsaa.i32.i16(i32 9, i32 3, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2darraymsaa.i32.i16(i32 9, i32 3, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.1darray.i32.i16{{(\.v8i32)?}}(i32 9, i32 3, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2darray.i32.i16{{(\.v8i32)?}}(i32 9, i32 3, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.cube.i32.i16{{(\.v8i32)?}}(i32 9, i32 3, i16 7, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2dmsaa.i32.i16{{(\.v8i32)?}}(i32 9, i32 3, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2darraymsaa.i32.i16{{(\.v8i32)?}}(i32 9, i32 3, i16 7, i16 7, i16 7, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestImageMemoryQualifier_lit.comp b/llpc/test/shaderdb/core/OpAtomicXXX_TestImageMemoryQualifier_lit.comp
index 17d02659c0..966f5f2496 100644
--- a/llpc/test/shaderdb/core/OpAtomicXXX_TestImageMemoryQualifier_lit.comp
+++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestImageMemoryQualifier_lit.comp
@@ -21,9 +21,9 @@ void main()
 ; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 1, i32 515, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> <i32 5, i32 5>, i32 9)
 
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16(i32 9, i16 5, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16(i32 9, i16 5, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16(i32 9, i16 5, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 5, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 5, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 5, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp
index 24e47f5333..8eea7b86cf 100644
--- a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp
+++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp
@@ -65,23 +65,23 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) @lgc.create.image.atomic.f32(i32 0, i32 1, i32 512, i32 0, ptr addrspace(4)
 
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2d.i32.i16(i32 9, i32 8, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.umin.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.umax.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2d.i32.i16(i32 9, i32 8, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.atomic.swap.2d.f32.{{i32|i16}}(float 9.000000e+00, {{i32|i16}} 7, {{i32|i16}} 7, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2d.i32.i16{{(\.v8i32)?}}(i32 9, i32 8, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.umin.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.umax.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.2d.i32.i16{{(\.v8i32)?}}(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.2d.i32.i16{{(\.v8i32)?}}(i32 9, i32 8, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.atomic.swap.2d.f32.{{i32|i16}}{{(\.v8i32)?}}(float 9.000000e+00, {{i32|i16}} 7, {{i32|i16}} 7, <8 x i32> %{{[-0-9A-Za-z_.]+}}, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag
index a9e619ecc8..9caf8ecd70 100644
--- a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag
+++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag
@@ -67,23 +67,23 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) @lgc.create.image.atomic.f32(i32 0, i32 9, i32 512, i32 0, ptr addrspace(4)
 
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 %{{.*}}, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.2d.i32.i16(i32 %{{.*}}, i16 2, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.2d.i32.i16(i32 %{{.*}}, i16 2, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2dmsaa.i32.i16(i32 %{{.*}}, i16 2, i16 2, i16 4, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i16(i32 %{{.*}}, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i16(i32 %{{.*}}, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i16(i32 %{{.*}}, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i16(i32 %{{.*}}, i32 28, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i16(i32 %{{.*}}, i16 1, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16{{(\.v8i32)?}}(i32 %{{.*}}, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smin.2d.i32.i16{{(\.v8i32)?}}(i32 %{{.*}}, i16 2, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.smax.2d.i32.i16{{(\.v8i32)?}}(i32 %{{.*}}, i16 2, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2dmsaa.i32.i16{{(\.v8i32)?}}(i32 %{{.*}}, i16 2, i16 2, i16 4, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i16{{(\.v8i32)?}}(i32 %{{.*}}, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i16{{(\.v8i32)?}}(i32 %{{.*}}, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i16{{(\.v8i32)?}}(i32 %{{.*}}, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i16{{(\.v8i32)?}}(i32 %{{.*}}, i32 28, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i16{{(\.v8i32)?}}(i32 %{{.*}}, i16 1, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: call i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32(i32 %{{[0-9]*}}, <4 x i32> %{{[0-9]*}}, i32 2, i32 0, i32 0
 ; SHADERTEST: call i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32 %{{[0-9]*}}, <4 x i32> %{{[0-9]*}}, i32 1, i32 0, i32 0
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2darraymsaa.i32.i16(i32 %{{.*}}, i16 2, i16 2, i16 2, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.cube.i32.i16(i32 %{{.*}}, i16 1, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.cube.i32.i16(i32 %{{.*}}, i16 2, i16 2, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.cube.i32.i16(i32 %{{.*}}, i16 1, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.cube.i32.i16(i32 %{{.*}}, i32 17, i16 1, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.atomic.swap.2d.f32.{{i32|i16}}(float %{{[-0-9A-Za0z_.]+}}, {{i32|i16}} 3, {{i32|i16}} 3, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.and.2darraymsaa.i32.i16{{(\.v8i32)?}}(i32 %{{.*}}, i16 2, i16 2, i16 2, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.or.cube.i32.i16{{(\.v8i32)?}}(i32 %{{.*}}, i16 1, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.xor.cube.i32.i16{{(\.v8i32)?}}(i32 %{{.*}}, i16 2, i16 2, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.swap.cube.i32.i16{{(\.v8i32)?}}(i32 %{{.*}}, i16 1, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.cmpswap.cube.i32.i16{{(\.v8i32)?}}(i32 %{{.*}}, i32 17, i16 1, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.atomic.swap.2d.f32.{{i32|i16}}{{(\.v8i32)?}}(float %{{[-0-9A-Za-z_.]+}}, {{i32|i16}} 3, {{i32|i16}} 3, <8 x i32> %{{[-0-9A-Za-z_.]+}}, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpAtomicXor_TestInt64ImageAtomicXor.spvasm b/llpc/test/shaderdb/core/OpAtomicXor_TestInt64ImageAtomicXor.spvasm
index 09b2ab028f..31e42ab898 100644
--- a/llpc/test/shaderdb/core/OpAtomicXor_TestInt64ImageAtomicXor.spvasm
+++ b/llpc/test/shaderdb/core/OpAtomicXor_TestInt64ImageAtomicXor.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call i64 (...) @lgc.create.image.atomic.i64(i32 10
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.xor.2d.i64.i16(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i64 @llvm.amdgcn.image.atomic.xor.2d.i64.i16{{(\.v8i32)?}}(i64 %{{.*}}, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpDecorationGroup_TestGroupAndGroupMember_lit.spvasm b/llpc/test/shaderdb/core/OpDecorationGroup_TestGroupAndGroupMember_lit.spvasm
index 4571be2480..24dcb427ef 100644
--- a/llpc/test/shaderdb/core/OpDecorationGroup_TestGroupAndGroupMember_lit.spvasm
+++ b/llpc/test/shaderdb/core/OpDecorationGroup_TestGroupAndGroupMember_lit.spvasm
@@ -1,12 +1,12 @@
 ; BEGIN_SHADERTEST
 ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s
-; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results
-; SHADERTEST: %{{[0-9]+}} = getelementptr <{ [4294967295 x float] }>, ptr addrspace(7) @{{.*}}, i32 0, i32 0, i32 %{{[0-9]+}}
-; SHADERTEST: %{{[0-9]+}} = getelementptr <{ [4294967295 x float] }>, ptr addrspace(7) @{{.*}}, i32 0, i32 0, i32 %{{[0-9]+}}
-; SHADERTEST: %{{[0-9]+}} = getelementptr <{ [4294967295 x float] }>, ptr addrspace(7) @{{.*}}, i32 0, i32 0, i32 %{{[0-9]+}}
-; SHADERTEST: %{{[0-9]+}} = getelementptr <{ [4294967295 x float] }>, ptr addrspace(7) @{{.*}}, i32 0, i32 0, i32 %{{[0-9]+}}
-; SHADERTEST: %{{[0-9]+}} = getelementptr <{ [4294967295 x float] }>, ptr addrspace(7) @{{.*}}, i32 0, i32 0, i32 %{{[0-9]+}}
-; SHADERTEST: %{{[0-9]+}} = getelementptr <{ [4294967295 x float] }>, ptr addrspace(7) @{{.*}}, i32 0, i32 0, i32 %{{[0-9]+}}
+; SHADERTEST-LABEL: {{^// LLPC}} final pipeline module info
+; SHADERTEST: %{{[0-9]+}} = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 0), !invariant.load !{{[0-9]+}}
+; SHADERTEST: %{{[0-9]+}} = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 0), !invariant.load !{{[0-9]+}}
+; SHADERTEST: %{{[0-9]+}} = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 0), !invariant.load !{{[0-9]+}}
+; SHADERTEST: %{{[0-9]+}} = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 0), !invariant.load !{{[0-9]+}}
+; SHADERTEST: %{{[0-9]+}} = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 0), !invariant.load !{{[0-9]+}}
+; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 ; END_SHADERTEST
 
diff --git a/llpc/test/shaderdb/core/OpExtInst_PackHalf2x16.comp b/llpc/test/shaderdb/core/OpExtInst_PackHalf2x16.comp
new file mode 100644
index 0000000000..ace5b6557d
--- /dev/null
+++ b/llpc/test/shaderdb/core/OpExtInst_PackHalf2x16.comp
@@ -0,0 +1,25 @@
+#version 450 core
+
+layout(local_size_x = 64, local_size_y = 1) in;
+
+layout(set = 0, binding = 0) buffer writeonly BufOut {
+    uint o[];
+};
+layout(set = 0, binding = 1) buffer readonly BufData {
+    float v[];
+};
+
+void main() {
+    uint ret = packHalf2x16(vec2(v[gl_LocalInvocationIndex], 0.)); // second component is the constant 0.0 that the CHECK line below matches as `float 0.000000e+00`
+    o[gl_LocalInvocationIndex] = ret; // store the packed result at the same index it was read from
+}
+/*
+; BEGIN_SHADERTEST
+; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^// LLPC}} final pipeline module info
+; CHECK: %{{[0-9]+}} = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %{{[0-9]+}}, float 0.000000e+00)
+
+; CHECK: AMDLLPC SUCCESS
+; END_SHADERTEST
+*/
diff --git a/llpc/test/shaderdb/core/OpFMul_TestOperandIsZero.spvasm b/llpc/test/shaderdb/core/OpFMul_TestOperandIsZero.spvasm
index 385d139db3..5e848e689f 100644
--- a/llpc/test/shaderdb/core/OpFMul_TestOperandIsZero.spvasm
+++ b/llpc/test/shaderdb/core/OpFMul_TestOperandIsZero.spvasm
@@ -1,7 +1,7 @@
 ; Test on ((b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b))
 
 ; BEGIN_SHADERTEST
-; RUN: amdllpc -stop-after=lgc-patch-mul-dx9-zero %gfxip -o - %s | FileCheck -check-prefix=SHADERTEST %s
+; RUN: amdllpc -stop-after=lgc-lower-mul-dx9-zero %gfxip -o - %s | FileCheck -check-prefix=SHADERTEST %s
 ; SHADERTEST-LABEL: @lgc.shader.FS.main(
 ; SHADERTEST: call nnan float @llvm.amdgcn.fmul.legacy(float {{.*}}, float {{.*}})
 ; SHADERTEST: call nnan float @llvm.amdgcn.fmul.legacy(float {{.*}}, float {{.*}})
diff --git a/llpc/test/shaderdb/core/OpImageDrefGather_TestBasic_lit.frag b/llpc/test/shaderdb/core/OpImageDrefGather_TestBasic_lit.frag
index 380040dcfa..1d3b341562 100644
--- a/llpc/test/shaderdb/core/OpImageDrefGather_TestBasic_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageDrefGather_TestBasic_lit.frag
@@ -20,7 +20,7 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> {{.*}}@lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) [[IMAGEPTR]], ptr addrspace(4) [[SAMPLERPTR]],{{.*}},{{.*}} float 2.000000e+00
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float 2.000000e+00,
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, float 2.000000e+00,
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageDrefGather_TestOffset_lit.frag b/llpc/test/shaderdb/core/OpImageDrefGather_TestOffset_lit.frag
index 2bbfe926ef..e3908d4b12 100644
--- a/llpc/test/shaderdb/core/OpImageDrefGather_TestOffset_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageDrefGather_TestOffset_lit.frag
@@ -21,7 +21,7 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> {{.*}}@lgc.create.image.gather.v4f32(i32 1, i32 512,{{.*}} float 2.000000e+00
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1,{{.*}}, float 2.000000e+00,{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1,{{.*}}, float 2.000000e+00,{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGatherOffset_lit.frag b/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGatherOffset_lit.frag
index bceba33837..4210d31dbb 100644
--- a/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGatherOffset_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGatherOffset_lit.frag
@@ -32,9 +32,9 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 9, i32 512, ptr addrspace(4)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 257, float 0x3FECCCCCC0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2darray.v4f32.f32(i32 1,
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1,
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 257, float 0x3FECCCCCC0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za-z_.]+}}, <4 x i32> %{{[-0-9A-Za-z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1,
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1,
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGatherOffsets_lit.frag b/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGatherOffsets_lit.frag
index a3b19e5012..98213c8300 100644
--- a/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGatherOffsets_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGatherOffsets_lit.frag
@@ -38,18 +38,18 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 9, i32 512, {{.*}}, i32 801, <2 x float> <float 1.000000e+00, float 1.000000e+00>, float 0.000000e+00, [4 x <2 x i32>] [<2 x i32> <i32 1, i32 1>, <2 x i32> <i32 2, i32 2>, <2 x i32> <i32 3, i32 3>, <2 x i32> <i32 4, i32 4>], float 0x3FE6666660000000)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 257, float 0x3FECCCCCC0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 514, float 0x3FECCCCCC0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 771, float 0x3FECCCCCC0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 1028, float 0x3FECCCCCC0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2darray.v4f32.f32(i32 1, i32 257, float 0x3FE99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000,{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2darray.v4f32.f32(i32 1, i32 514, float 0x3FE99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000,{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2darray.v4f32.f32(i32 1, i32 771, float 0x3FE99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000,{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2darray.v4f32.f32(i32 1, i32 1028, float 0x3FE99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000,{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f16(i32 1, i32 257, float 0x3FE6666660000000, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f16(i32 1, i32 514, float 0x3FE6666660000000, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f16(i32 1, i32 771, float 0x3FE6666660000000, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f16(i32 1, i32 1028, float 0x3FE6666660000000, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 257, float 0x3FECCCCCC0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za-z_.]+}}, <4 x i32> %{{[-0-9A-Za-z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 514, float 0x3FECCCCCC0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za-z_.]+}}, <4 x i32> %{{[-0-9A-Za-z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 771, float 0x3FECCCCCC0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za-z_.]+}}, <4 x i32> %{{[-0-9A-Za-z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 1028, float 0x3FECCCCCC0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za-z_.]+}}, <4 x i32> %{{[-0-9A-Za-z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 257, float 0x3FE99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000,{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 514, float 0x3FE99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000,{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 771, float 0x3FE99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000,{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 1028, float 0x3FE99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000,{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 257, float 0x3FE6666660000000, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 514, float 0x3FE6666660000000, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 771, float 0x3FE6666660000000, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 1028, float 0x3FE6666660000000, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGather_lit.frag b/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGather_lit.frag
index 6d364695d0..b9e530a0ce 100644
--- a/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGather_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGather_lit.frag
@@ -37,9 +37,9 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 9, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 545, <2 x float> <float 1.000000e+00, float 1.000000e+00>, float 0.000000e+00, float 0x3FE6666660000000)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float 0x3FECCCCCC0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.2darray.v4f32.f32(i32 1, float 0x3FE99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0.000000e+00, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f16(i32 1, float 0x3FE6666660000000, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, float 0x3FECCCCCC0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za-z_.]+}}, <4 x i32> %{{[-0-9A-Za-z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, float 0x3FE99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0.000000e+00, <8 x i32> %{{[-0-9A-Za-z_.]+}}, <4 x i32> %{{[-0-9A-Za-z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, float 0x3FE6666660000000, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageExplicitLod_TestDrefLodOffset_lit.frag b/llpc/test/shaderdb/core/OpImageExplicitLod_TestDrefLodOffset_lit.frag
index 5e243f4bcd..da9aa00d59 100644
--- a/llpc/test/shaderdb/core/OpImageExplicitLod_TestDrefLodOffset_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageExplicitLod_TestDrefLodOffset_lit.frag
@@ -20,7 +20,7 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) @lgc.create.image.sample.f32(i32 1, i32 512, ptr addrspace(4){{.*}}, i32 801,{{.*}}, float 1.000000e+00, <2 x i32> <i32 2, i32 3>,
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.sample.c.l.o.2d.f32.f32(i32 1, i32 770,{{.*}},{{.*}},{{.*}}, float 1.000000e+00,{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.sample.c.l.o.2d.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 770,{{.*}},{{.*}},{{.*}}, float 1.000000e+00,{{.*}},{{.*}}, i1 false, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageFetch_Test2DMSArray_disableShadowTable_lit.frag b/llpc/test/shaderdb/core/OpImageFetch_Test2DMSArray_disableShadowTable_lit.frag
index 73c3fdd3c1..1fe0e6b137 100644
--- a/llpc/test/shaderdb/core/OpImageFetch_Test2DMSArray_disableShadowTable_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageFetch_Test2DMSArray_disableShadowTable_lit.frag
@@ -9,7 +9,7 @@
 
 // SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
 // "i32 2" is provided sample number
-// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, {{.*}}, {{.*}}, {{.*}}, i32 2, <8 x i32> %{{[0-9]*}}, i32 0, i32 0)
+// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32{{(\.v8i32)?}}(i32 15, {{.*}}, {{.*}}, {{.*}}, i32 2, <8 x i32> %{{[0-9]*}}, i32 0, i32 0)
 // SHADERTEST: AMDLLPC SUCCESS
 // END_SHADERTEST
 
diff --git a/llpc/test/shaderdb/core/OpImageFetch_Test2DMSArray_lit.frag b/llpc/test/shaderdb/core/OpImageFetch_Test2DMSArray_lit.frag
index 97f0d5411c..f6a572d3a5 100644
--- a/llpc/test/shaderdb/core/OpImageFetch_Test2DMSArray_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageFetch_Test2DMSArray_lit.frag
@@ -24,8 +24,8 @@ call { <4 x i32> addrspace(4)*, i32 } (...) @"lgc.create.get.sampler.desc.ptr.s[
 call <4 x float> (...) @lgc.create.image.load.with.fmask.v4f32(i32 7, i32 0,{{.*}}, i32 2)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-call i32 @llvm.amdgcn.image.load.3d.i32.i32(i32 1,{{.*}}, i32 0, i32 0)
-call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 {{.*}}, i32 0, i32 0)
+call i32 @llvm.amdgcn.image.load.3d.i32.i32{{(\.v8i32)?}}(i32 1,{{.*}}, i32 0, i32 0)
+call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32{{(\.v8i32)?}}(i32 15, i32 {{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageFetch_Test2DMS_lit.frag b/llpc/test/shaderdb/core/OpImageFetch_Test2DMS_lit.frag
index d6d8cb499f..57cf432198 100644
--- a/llpc/test/shaderdb/core/OpImageFetch_Test2DMS_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageFetch_Test2DMS_lit.frag
@@ -21,7 +21,7 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.with.fmask.v4f32(i32 6, i32 1536, {{.*}}, i32 2)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15,{{.*}},{{.*}},{{.*}},{{.*}}, i32 0, i32 0), !invariant.load
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32{{(\.v8i32)?}}(i32 15,{{.*}},{{.*}},{{.*}},{{.*}}, i32 0, i32 0), !invariant.load
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageFetch_TestBasic_lit.frag b/llpc/test/shaderdb/core/OpImageFetch_TestBasic_lit.frag
index 0573f7f762..b2cc628704 100644
--- a/llpc/test/shaderdb/core/OpImageFetch_TestBasic_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageFetch_TestBasic_lit.frag
@@ -20,7 +20,7 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 1536, {{.*}}, i32 2)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15,{{.*}},{{.*}}, i32 2,{{.*}}, i32 0, i32 0), !invariant.load
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32{{(\.v8i32)?}}(i32 15,{{.*}},{{.*}}, i32 2,{{.*}}, i32 0, i32 0), !invariant.load
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageFetch_TestIntegerSampler_lit.frag b/llpc/test/shaderdb/core/OpImageFetch_TestIntegerSampler_lit.frag
index 6fc0d6d679..216a42493f 100644
--- a/llpc/test/shaderdb/core/OpImageFetch_TestIntegerSampler_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageFetch_TestIntegerSampler_lit.frag
@@ -22,8 +22,8 @@ void main()
 ; SHADERTEST: call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 1, i32 1536, {{.*}}, <2 x i32> <i32 0, i32 1>, i32 0)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.load.2d.v4i32.i16(i32 15, i16 0, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
-; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.load.2d.v4i32.i16(i32 15, i16 0, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
+; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.load.2d.v4i32.i16{{(\.v8i32)?}}(i32 15, i16 0, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
+; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.load.2d.v4i32.i16{{(\.v8i32)?}}(i32 15, i16 0, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageFetch_TestOffset_lit.frag b/llpc/test/shaderdb/core/OpImageFetch_TestOffset_lit.frag
index aa7d8c40fd..c7e2ce52ea 100644
--- a/llpc/test/shaderdb/core/OpImageFetch_TestOffset_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageFetch_TestOffset_lit.frag
@@ -22,7 +22,7 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 1536, {{.*}}, i32 2)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15,{{.*}},{{.*}}, i32 2,{{.*}}, i32 0, i32 0), !invariant.load
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32{{(\.v8i32)?}}(i32 15,{{.*}},{{.*}}, i32 2,{{.*}}, i32 0, i32 0), !invariant.load
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageFetch_TestTexelFetchOffset_lit.frag b/llpc/test/shaderdb/core/OpImageFetch_TestTexelFetchOffset_lit.frag
index 955b70ae62..4c1cd88342 100644
--- a/llpc/test/shaderdb/core/OpImageFetch_TestTexelFetchOffset_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageFetch_TestTexelFetchOffset_lit.frag
@@ -51,12 +51,12 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 5, i32 1664, {{.*}}, <3 x i32> <i32 4, i32 4, i32 1>, i32 2)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i16(i32 15, i16 6, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i16(i32 15, i16 12, i16 12, i16 6, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i16(i32 15, i16 4, i16 4, i16 4, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 9, i16 9, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i16(i32 15, i16 12, i16 5, i16 6, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i16(i32 15, i16 4, i16 4, i16 1, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 6, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 12, i16 12, i16 6, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 4, i16 4, i16 4, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 9, i16 9, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 12, i16 5, i16 6, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 4, i16 4, i16 1, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageFetch_TestTexelFetch_lit.frag b/llpc/test/shaderdb/core/OpImageFetch_TestTexelFetch_lit.frag
index bb559e78dd..34346a5f11 100644
--- a/llpc/test/shaderdb/core/OpImageFetch_TestTexelFetch_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageFetch_TestTexelFetch_lit.frag
@@ -40,12 +40,12 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.with.fmask.v4f32(i32 6, i32 1664, {{.*}}, {{.*}}, <2 x i32> <i32 6, i32 6>, i32 4)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i16(i32 15, i16 2, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i16(i32 15, i16 7, i16 7, i16 8, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 2, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 7, i16 7, i16 8, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
 ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32({{.*}}, i32 5, i32 0, i32 0, i32 0), !invariant.load
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.2d.i32.i16(i32 1, i16 6, i16 6, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 6, i32 6,{{.*}},{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.2d.i32.i16{{(\.v8i32)?}}(i32 1, i16 6, i16 6, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32{{(\.v8i32)?}}(i32 15, i32 6, i32 6,{{.*}},{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageGather_TestBasic_lit.frag b/llpc/test/shaderdb/core/OpImageGather_TestBasic_lit.frag
index d3e607fc58..7bffbcffb6 100644
--- a/llpc/test/shaderdb/core/OpImageGather_TestBasic_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageGather_TestBasic_lit.frag
@@ -20,7 +20,7 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, {{.*}}, i32 37, {{.*}}, i32 1, float 0.000000e+00)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 2,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 2,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageGather_TestConstOffsets_lit.frag b/llpc/test/shaderdb/core/OpImageGather_TestConstOffsets_lit.frag
index 428160ad38..98b0ae7c65 100644
--- a/llpc/test/shaderdb/core/OpImageGather_TestConstOffsets_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageGather_TestConstOffsets_lit.frag
@@ -21,10 +21,10 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <2 x {{.*}}, i32 2, float 0.000000e+00, [4 x <2 x i32>] [<2 x i32> <i32 1, i32 2>, <2 x i32> <i32 3, i32 4>, <2 x i32> <i32 5, i32 6>, <2 x i32> <i32 7, i32 8>])
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 4, i32 513,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 4, i32 1027,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 4, i32 1541,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 4, i32 2055,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 4, i32 513,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 4, i32 1027,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 4, i32 1541,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 4, i32 2055,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageGather_TestDrefConstOffsets_lit.frag b/llpc/test/shaderdb/core/OpImageGather_TestDrefConstOffsets_lit.frag
index 6d1959e231..07fbf87ca1 100644
--- a/llpc/test/shaderdb/core/OpImageGather_TestDrefConstOffsets_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageGather_TestDrefConstOffsets_lit.frag
@@ -21,10 +21,10 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 801, <2 x {{.*}}, float 0.000000e+00, [4 x <2 x i32>] [<2 x i32> <i32 1, i32 2>, <2 x i32> <i32 3, i32 4>, <2 x i32> <i32 5, i32 6>, <2 x i32> <i32 7, i32 8>], float 1.000000e+00)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 513, float 1.000000e+00,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 1027, float 1.000000e+00,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 1541, float 1.000000e+00,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 2055, float 1.000000e+00,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 513, float 1.000000e+00,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 1027, float 1.000000e+00,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 1541, float 1.000000e+00,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 2055, float 1.000000e+00,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageGather_TestIntegerSampler.frag b/llpc/test/shaderdb/core/OpImageGather_TestIntegerSampler.frag
index 2eb83e3bf0..defe38dfa3 100644
--- a/llpc/test/shaderdb/core/OpImageGather_TestIntegerSampler.frag
+++ b/llpc/test/shaderdb/core/OpImageGather_TestIntegerSampler.frag
@@ -33,18 +33,18 @@ void main()
 ; SHADERTEST: call <4 x i32> (...) @lgc.create.image.gather.v4i32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <2 x float> <float 0.000000e+00, float 1.000000e+00>, i32 0, float 0.000000e+00, [4 x <2 x i32>] [<2 x i32> <i32 1, i32 1>, <2 x i32> <i32 2, i32 2>, <2 x i32> <i32 3, i32 3>, <2 x i32> <i32 4, i32 4>])
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 513, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 257, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 514, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 771, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 1028, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 513, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 257, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 514, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 771, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 1028, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 513, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 257, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 514, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 771, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 1028, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 513, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 257, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 514, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 771, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 1028, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageGather_TestOffset_lit.frag b/llpc/test/shaderdb/core/OpImageGather_TestOffset_lit.frag
index 6509cb2964..da997e7848 100644
--- a/llpc/test/shaderdb/core/OpImageGather_TestOffset_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageGather_TestOffset_lit.frag
@@ -21,7 +21,7 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <2 x {{.*}}, i32 2, float 0.000000e+00, <2 x {{.*}})
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 4,{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 4,{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherBiasLod_lit.frag b/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherBiasLod_lit.frag
index 9ed1e5cc97..9d42fa96ed 100644
--- a/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherBiasLod_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherBiasLod_lit.frag
@@ -79,26 +79,26 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <3 x {{.*}}, i32 1, {{.*}}, [4 x <2 x i32>] [<2 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>, <2 x i32> <i32 1, i32 0>, <2 x i32> <i32 1, i32 1>])
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 1,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 2,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.cube.v4f32.f32.f32(i32 4,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.cube.v4f32.f32.f32(i32 8,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.o.2darray.v4f32.f32.f32(i32 2, i32 256,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 256,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 1,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 257,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.o.2darray.v4f32.f32.f32(i32 2, i32 1,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.o.2darray.v4f32.f32.f32(i32 2, i32 257,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 1,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.2darray.v4f32.f32(i32 2,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.cube.v4f32.f32(i32 4,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.cube.v4f32.f32(i32 8,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.o.2darray.v4f32.f32(i32 2, i32 256,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 256,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 1,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 257,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.o.2darray.v4f32.f32(i32 2, i32 1,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.o.2darray.v4f32.f32(i32 2, i32 257,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 2,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.cube.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 4,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.cube.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 8,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.o.2darray.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 2, i32 256,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 256,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 1,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 257,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.o.2darray.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 2, i32 1,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.o.2darray.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 2, i32 257,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 2,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.cube.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 4,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.cube.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 8,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.o.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 2, i32 256,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 256,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 1,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 257,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.o.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 2, i32 1,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.l.o.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 2, i32 257,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherOffset_lit.frag b/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherOffset_lit.frag
index 38c9dc866d..8f889cb668 100644
--- a/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherOffset_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherOffset_lit.frag
@@ -36,9 +36,9 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 9, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <2 x float> <float 1.000000e+00, float 1.000000e+00>, i32 0, float 0.000000e+00, <2 x i32> <i32 1, i32 1>)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 4,{{.*}}, float 0x3FB99999A0000000, float 0x3FB99999A0000000,{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2darray.v4f32.f32(i32 8,{{.*}}, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0.000000e+00,{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 257, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 4,{{.*}}, float 0x3FB99999A0000000, float 0x3FB99999A0000000,{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 8,{{.*}}, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0.000000e+00,{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 257, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherOffsets_lit.frag b/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherOffsets_lit.frag
index 3bb640f2f3..d839413a06 100644
--- a/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherOffsets_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherOffsets_lit.frag
@@ -37,18 +37,18 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 9, i32 512, {{.*}}, {{.*}}, i32 293, <2 x float> <float 1.000000e+00, float 1.000000e+00>, i32 0, float 0.000000e+00, [4 x <2 x i32>] [<2 x i32> <i32 1, i32 1>, <2 x i32> <i32 2, i32 2>, <2 x i32> <i32 3, i32 3>, <2 x i32> <i32 4, i32 4>])
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 4, i32 257, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 4, i32 514, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 4, i32 771, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 4, i32 1028, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2darray.v4f32.f32(i32 8, i32 257, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0.000000e+00, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2darray.v4f32.f32(i32 8, i32 514, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0.000000e+00, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2darray.v4f32.f32(i32 8, i32 771, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0.000000e+00, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2darray.v4f32.f32(i32 8, i32 1028, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0.000000e+00, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 257, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 514, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 771, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16(i32 1, i32 1028, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 4, i32 257, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 4, i32 514, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 4, i32 771, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 4, i32 1028, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 8, i32 257, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0.000000e+00, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 8, i32 514, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0.000000e+00, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 8, i32 771, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0.000000e+00, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 8, i32 1028, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0.000000e+00, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 257, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 514, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 771, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, i32 1028, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageGather_TestTextureGather_lit.frag b/llpc/test/shaderdb/core/OpImageGather_TestTextureGather_lit.frag
index 65ad0c170f..41456de283 100644
--- a/llpc/test/shaderdb/core/OpImageGather_TestTextureGather_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageGather_TestTextureGather_lit.frag
@@ -35,9 +35,9 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 9, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 37, <2 x float> <float 1.000000e+00, float 1.000000e+00>, i32 0, float 0.000000e+00)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 4, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.2darray.v4f32.f32(i32 8, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0.000000e+00, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 4, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 8, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0.000000e+00, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageReadWrite_TestImageLoadStoreLod_lit.comp b/llpc/test/shaderdb/core/OpImageReadWrite_TestImageLoadStoreLod_lit.comp
index 36e5dbefc1..cf780094e9 100644
--- a/llpc/test/shaderdb/core/OpImageReadWrite_TestImageLoadStoreLod_lit.comp
+++ b/llpc/test/shaderdb/core/OpImageReadWrite_TestImageLoadStoreLod_lit.comp
@@ -57,19 +57,19 @@ void main()
 ; SHADERTEST: call {{.*}} @lgc.create.image.store({{.*}}, i32 8, i32 516, {{.*}}, <4 x i32> <i32 9, i32 9, i32 3, i32 1>, i32 7)
 
 ; SHADERTEST-LABEL: LLPC pipeline patching results
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.mip.1d.i32.i16(i32 1, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.mip.2d.i32.i16(i32 1, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.mip.3d.i32.i16(i32 1, i16 9, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.mip.cube.i32.i16(i32 1, i16 9, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.mip.1darray.i32.i16(i32 1, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.mip.2darray.i32.i16(i32 1, i16 9, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST:  call void @llvm.amdgcn.image.store.mip.1d.f32.i16(float %{{.*}}, i32 1, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST:  call void @llvm.amdgcn.image.store.mip.2d.f32.i16(float %{{.*}}, i32 1, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST:  call void @llvm.amdgcn.image.store.mip.3d.f32.i16(float %{{.*}}, i32 1, i16 9, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST:  call void @llvm.amdgcn.image.store.mip.cube.f32.i16(float %{{.*}}, i32 1, i16 9, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST:  call void @llvm.amdgcn.image.store.mip.1darray.f32.i16(float %{{.*}}, i32 1, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST:  call void @llvm.amdgcn.image.store.mip.2darray.f32.i16(float %{{.*}}, i32 1, i16 9, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST:  call void @llvm.amdgcn.image.store.mip.cube.f32.i16(float %{{.*}}, i32 1, i16 9, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.mip.1d.i32.i16{{(\.v8i32)?}}(i32 1, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.mip.2d.i32.i16{{(\.v8i32)?}}(i32 1, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.mip.3d.i32.i16{{(\.v8i32)?}}(i32 1, i16 9, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.mip.cube.i32.i16{{(\.v8i32)?}}(i32 1, i16 9, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.mip.1darray.i32.i16{{(\.v8i32)?}}(i32 1, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.mip.2darray.i32.i16{{(\.v8i32)?}}(i32 1, i16 9, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST:  call void @llvm.amdgcn.image.store.mip.1d.f32.i16{{(\.v8i32)?}}(float %{{.*}}, i32 1, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST:  call void @llvm.amdgcn.image.store.mip.2d.f32.i16{{(\.v8i32)?}}(float %{{.*}}, i32 1, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST:  call void @llvm.amdgcn.image.store.mip.3d.f32.i16{{(\.v8i32)?}}(float %{{.*}}, i32 1, i16 9, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST:  call void @llvm.amdgcn.image.store.mip.cube.f32.i16{{(\.v8i32)?}}(float %{{.*}}, i32 1, i16 9, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST:  call void @llvm.amdgcn.image.store.mip.1darray.f32.i16{{(\.v8i32)?}}(float %{{.*}}, i32 1, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST:  call void @llvm.amdgcn.image.store.mip.2darray.f32.i16{{(\.v8i32)?}}(float %{{.*}}, i32 1, i16 9, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST:  call void @llvm.amdgcn.image.store.mip.cube.f32.i16{{(\.v8i32)?}}(float %{{.*}}, i32 1, i16 9, i16 9, i16 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageRead_Test2DMS_lit.comp b/llpc/test/shaderdb/core/OpImageRead_Test2DMS_lit.comp
index 86e3e21d7e..f9cd98d668 100644
--- a/llpc/test/shaderdb/core/OpImageRead_Test2DMS_lit.comp
+++ b/llpc/test/shaderdb/core/OpImageRead_Test2DMS_lit.comp
@@ -26,8 +26,8 @@ void main()
 ; SHADERTEST: call {{.*}} @lgc.create.image.load.v4f32(i32 7, i32 512, {{.*}}, <4 x i32> <i32 0, i32 0, i32 0, i32 1>)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i16(i32 15, i16 0, i16 0, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i16(i32 15, i16 0, i16 0, i16 0, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 0, i16 0, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 0, i16 0, i16 0, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageRead_TestBasic_lit.comp b/llpc/test/shaderdb/core/OpImageRead_TestBasic_lit.comp
index 16ed00e2c1..73583ae900 100644
--- a/llpc/test/shaderdb/core/OpImageRead_TestBasic_lit.comp
+++ b/llpc/test/shaderdb/core/OpImageRead_TestBasic_lit.comp
@@ -41,12 +41,12 @@ void main()
 ; SHADERTEST: call {{.*}} @lgc.create.image.load.v4f32(i32 9, i32 512, {{.*}}, <2 x i32> zeroinitializer)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 0, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i16(i32 15, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i16(i32 15, i16 0, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 0, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 0, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageRead_TestCube_lit.comp b/llpc/test/shaderdb/core/OpImageRead_TestCube_lit.comp
index f86cc7124f..324270fd4d 100644
--- a/llpc/test/shaderdb/core/OpImageRead_TestCube_lit.comp
+++ b/llpc/test/shaderdb/core/OpImageRead_TestCube_lit.comp
@@ -26,7 +26,7 @@ void main()
 ; SHADERTEST: call {{.*}} @lgc.create.image.load.v4f32(i32 8, i32 512, {{.*}}, <4 x i32> <i32 0, i32 0, i32 0, i32 1>)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i16(i32 15, i16 0, i16 0, i16 6, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 0, i16 0, i16 6, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageRead_TestImageLoad_lit.frag b/llpc/test/shaderdb/core/OpImageRead_TestImageLoad_lit.frag
index 8b515b730b..760528d619 100644
--- a/llpc/test/shaderdb/core/OpImageRead_TestImageLoad_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageRead_TestImageLoad_lit.frag
@@ -40,11 +40,11 @@ void main()
 ; SHADERTEST: call {{.*}} @lgc.create.image.load.v4f32(i32 6, i32 512, {{.*}}, <3 x i32> <i32 8, i32 9, i32 2>)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 2, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 2, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32({{.*}}, i32 4, i32 0, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i16(i32 15, i16 5, i16 6, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i16(i32 15, i16 8, i16 9, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 5, i16 6, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 8, i16 9, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageRead_TestInt64ImageLoad.spvasm b/llpc/test/shaderdb/core/OpImageRead_TestInt64ImageLoad.spvasm
index 7ab53b35e0..9e532f1912 100644
--- a/llpc/test/shaderdb/core/OpImageRead_TestInt64ImageLoad.spvasm
+++ b/llpc/test/shaderdb/core/OpImageRead_TestInt64ImageLoad.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call <4 x i64> (...) @lgc.create.image.load.v4i64
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call <2 x i32> @llvm.amdgcn.image.load.2d.v2i32.i16(i32 3, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !10
+; SHADERTEST: call <2 x i32> @llvm.amdgcn.image.load.2d.v2i32.i16{{(\.v8i32)?}}(i32 3, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !10
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpImageRead_TestIntImage_lit.comp b/llpc/test/shaderdb/core/OpImageRead_TestIntImage_lit.comp
index 58a98840b4..2941d6589f 100644
--- a/llpc/test/shaderdb/core/OpImageRead_TestIntImage_lit.comp
+++ b/llpc/test/shaderdb/core/OpImageRead_TestIntImage_lit.comp
@@ -26,7 +26,7 @@ void main()
 ; SHADERTEST: call {{.*}} @lgc.create.image.load.v4i32(i32 1, i32 512, {{.*}}, <2 x i32> zeroinitializer)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.load.2d.v4i32.i16(i32 15, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.load.2d.v4i32.i16{{(\.v8i32)?}}(i32 15, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageRead_TestMemoryQualifier_lit.comp b/llpc/test/shaderdb/core/OpImageRead_TestMemoryQualifier_lit.comp
index edba3359b8..5bb6ae687d 100644
--- a/llpc/test/shaderdb/core/OpImageRead_TestMemoryQualifier_lit.comp
+++ b/llpc/test/shaderdb/core/OpImageRead_TestMemoryQualifier_lit.comp
@@ -33,10 +33,10 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 515, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> <i32 4, i32 4>)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 2, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 5)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 4, i16 4, <8 x i32> %{{.*}}, i32 0, i32 5)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 2, i16 2, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 5)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16{{(\.v8i32)?}}(i32 15, i16 4, i16 4, <8 x i32> %{{.*}}, i32 0, i32 5)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageRead_TestNonVec4Data_lit.spvasm b/llpc/test/shaderdb/core/OpImageRead_TestNonVec4Data_lit.spvasm
index b4a50e27b2..172be262f4 100644
--- a/llpc/test/shaderdb/core/OpImageRead_TestNonVec4Data_lit.spvasm
+++ b/llpc/test/shaderdb/core/OpImageRead_TestNonVec4Data_lit.spvasm
@@ -17,15 +17,15 @@
 ; SHADERTEST: call {{.*}} @lgc.create.image.load.v3f32(i32 1, i32 512, {{.*}}, <2 x i32> <i32 1, i32 1>)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.2d.i32.i16(i32 1, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call <2 x i32> @llvm.amdgcn.image.load.2d.v2i32.i16(i32 3, i16 0, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call <3 x i32> @llvm.amdgcn.image.load.2d.v3i32.i16(i32 7, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.2d.i32.i16(i32 1, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call <2 x i32> @llvm.amdgcn.image.load.2d.v2i32.i16(i32 3, i16 0, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call <3 x i32> @llvm.amdgcn.image.load.2d.v3i32.i16(i32 7, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.image.load.2d.f32.i16(i32 1, i16 0, i16 0,{{.*}}, i32 0, i32 0)
-; SHADERTEST: call reassoc nnan nsz arcp contract afn <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i16(i32 3, i16 0, i16 1,{{.*}}, i32 0, i32 0)
-; SHADERTEST: call reassoc nnan nsz arcp contract afn <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i16(i32 7, i16 1, i16 1,{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.2d.i32.i16{{(\.v8i32)?}}(i32 1, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call <2 x i32> @llvm.amdgcn.image.load.2d.v2i32.i16{{(\.v8i32)?}}(i32 3, i16 0, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call <3 x i32> @llvm.amdgcn.image.load.2d.v3i32.i16{{(\.v8i32)?}}(i32 7, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.2d.i32.i16{{(\.v8i32)?}}(i32 1, i16 0, i16 0, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call <2 x i32> @llvm.amdgcn.image.load.2d.v2i32.i16{{(\.v8i32)?}}(i32 3, i16 0, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call <3 x i32> @llvm.amdgcn.image.load.2d.v3i32.i16{{(\.v8i32)?}}(i32 7, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.image.load.2d.f32.i16{{(\.v8i32)?}}(i32 1, i16 0, i16 0,{{.*}}, i32 0, i32 0)
+; SHADERTEST: call reassoc nnan nsz arcp contract afn <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i16{{(\.v8i32)?}}(i32 3, i16 0, i16 1,{{.*}}, i32 0, i32 0)
+; SHADERTEST: call reassoc nnan nsz arcp contract afn <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i16{{(\.v8i32)?}}(i32 7, i16 1, i16 1,{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 ; END_SHADERTEST
 
diff --git a/llpc/test/shaderdb/core/OpImageRead_TestSubpassInput_lit.frag b/llpc/test/shaderdb/core/OpImageRead_TestSubpassInput_lit.frag
index 6ba114d321..8ca49de146 100644
--- a/llpc/test/shaderdb/core/OpImageRead_TestSubpassInput_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageRead_TestSubpassInput_lit.frag
@@ -40,9 +40,9 @@ void main()
 ; SHADERTEST: call {{.*}} @lgc.create.image.load.with.fmask.v4i32(i32 6, i32 608, {{.*}}, {{.*}}, <2 x i32> zeroinitializer, i32 7)
 
 ; SHADERTEST-LABEL: {{^// LLPC}}  pipeline patching results
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15,{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, {{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15,{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32{{(\.v8i32)?}}(i32 15,{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32{{(\.v8i32)?}}(i32 15, {{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32{{(\.v8i32)?}}(i32 15,{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestLod_lit.frag b/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestLod_lit.frag
index a78d9d20f5..7616342fc0 100644
--- a/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestLod_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestLod_lit.frag
@@ -24,7 +24,7 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half 0xH0000, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH0000, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGradClamp_lit.frag b/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGradClamp_lit.frag
index 678ff351d7..26c2d819f4 100644
--- a/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGradClamp_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGradClamp_lit.frag
@@ -55,13 +55,13 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32({{.*}}, float 1.000000e+00, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF19999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}})
+; SHADERTEST: call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float 1.000000e+00, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF19999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.3d.v4f32.f32.f32({{.*}}, float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, {{.*}})
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.2d.v4f32.f32.f32({{.*}}, i32 514, float 1.000000e+00, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF19999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}})
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.3d.v4f32.f32.f32({{.*}}, i32 197379, float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.3d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.2d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 514, float 1.000000e+00, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF19999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.3d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 197379, float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, {{.*}})
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGradOffset_lit.frag b/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGradOffset_lit.frag
index 09c61dcd31..82829090d9 100644
--- a/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGradOffset_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGradOffset_lit.frag
@@ -37,11 +37,11 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.o.1d.v4f32.f16.f16(i32 15, i32 2, half 0xH4000, half 0xH4200, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.o.1d.v4f32.f16.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, i32 2, half 0xH4000, half 0xH4200, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.o.2d.v4f32.f16.f16(i32 15, i32 771, half 0xH4500, half 0xH4500, half 0xH4600, half 0xH4600, half 0xH4400, half 0xH4400, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.o.2d.v4f32.f16.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, i32 771, half 0xH4500, half 0xH4500, half 0xH4600, half 0xH4600, half 0xH4400, half 0xH4400, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGrad_lit.frag b/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGrad_lit.frag
index 064891b5a9..451809ce01 100644
--- a/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGrad_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGrad_lit.frag
@@ -37,11 +37,11 @@ void main()
 ; SHADERTEST-LABEL: pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half 0xH4000, half 0xH4200, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH4000, half 0xH4200, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half 0xH4500, half 0xH4500, half 0xH4600, half 0xH4600, half 0xH4400, half 0xH4400, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH4500, half 0xH4500, half 0xH4600, half 0xH4600, half 0xH4400, half 0xH4400, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureLodOffset_lit.frag b/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureLodOffset_lit.frag
index 86d578ab20..b594b9a6ae 100644
--- a/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureLodOffset_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureLodOffset_lit.frag
@@ -37,11 +37,11 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32({{.*}}, i32 6, float 5.000000e-01, float 0x3FD99999A0000000, {{.*}})
-; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 6, float 5.000000e-01, float 0x3FD99999A0000000, {{.*}})
+; SHADERTEST: call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.l.o.2d.v4f32.f32({{.*}}, i32 1285, float 0x3FE3333340000000, float 0x3FE3333340000000, float 0x3FE6666660000000, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.l.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 1285, float 0x3FE3333340000000, float 0x3FE3333340000000, float 0x3FE6666660000000, {{.*}})
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureLod_lit.frag b/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureLod_lit.frag
index df50628a44..2567074f9e 100644
--- a/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureLod_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureLod_lit.frag
@@ -37,11 +37,11 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32({{.*}}, float 5.000000e-01, float 0x3FD99999A0000000, {{.*}})
-; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float 5.000000e-01, float 0x3FD99999A0000000, {{.*}})
+; SHADERTEST: call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32({{.*}}, float 0x3FE3333340000000, float 0x3FE3333340000000, float 0x3FE6666660000000, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float 0x3FE3333340000000, float 0x3FE3333340000000, float 0x3FE6666660000000, {{.*}})
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test1DArray_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test1DArray_lit.frag
index 99cb7c4dac..fc94b68322 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test1DArray_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test1DArray_lit.frag
@@ -24,7 +24,7 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half 0xH3C00, half 0xH4000, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH3C00, half 0xH4000, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test2DArray_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test2DArray_lit.frag
index 3a1f42440c..cdd1238b7a 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test2DArray_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test2DArray_lit.frag
@@ -24,7 +24,7 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half 0xH3C00, half 0xH4000, half 0xH4200, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH3C00, half 0xH4000, half 0xH4200, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test2DRect_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test2DRect_lit.frag
index 0beb3033f9..72ea603217 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test2DRect_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test2DRect_lit.frag
@@ -24,7 +24,7 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half 0xH3C00, half 0xH4000, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH3C00, half 0xH4000, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 true, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test3D_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test3D_lit.frag
index 2e4d911560..2b3e078071 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test3D_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_Test3D_lit.frag
@@ -24,7 +24,7 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half 0xH3C00, half 0xH4000, half 0xH4200, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH3C00, half 0xH4000, half 0xH4200, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestArrayDirectAccess_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestArrayDirectAccess_lit.frag
index 5579768037..fe880adb9c 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestArrayDirectAccess_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestArrayDirectAccess_lit.frag
@@ -23,7 +23,7 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestBasic_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestBasic_lit.frag
index 79b46e8c01..3c7faf3f6b 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestBasic_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestBasic_lit.frag
@@ -22,7 +22,7 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half 0xH0000, half 0xH0000, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH0000, half 0xH0000, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestBias_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestBias_lit.frag
index e82e7296f2..f4d6d1ae9a 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestBias_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestBias_lit.frag
@@ -24,7 +24,7 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half 0xH3C00, half 0xH0000, half 0xH0000, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH3C00, half 0xH0000, half 0xH0000, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestDrefGrad_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestDrefGrad_lit.frag
index e677751cc0..5add544d49 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestDrefGrad_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestDrefGrad_lit.frag
@@ -25,7 +25,7 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.sample.c.d.2d.f32.f32.f32({{.*}}, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, {{.*}})
+; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.sample.c.d.2d.f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, {{.*}})
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestGrad_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestGrad_lit.frag
index 6848eb667e..8e306467d5 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestGrad_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestGrad_lit.frag
@@ -25,7 +25,7 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32({{.*}}, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, {{.*}})
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestIntegerSampler_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestIntegerSampler_lit.frag
index 780f8fc1eb..bd10af17bc 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestIntegerSampler_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestIntegerSampler_lit.frag
@@ -28,10 +28,10 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.sample.2d.v4i32.f16(i32 15, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.sample.2d.v4i32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.sample.2d.v4i32.f16(i32 15, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.sample.2d.v4i32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestMultiDimArrayDirectAccess_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestMultiDimArrayDirectAccess_lit.frag
index 7109e65a27..d467efd21e 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestMultiDimArrayDirectAccess_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestMultiDimArrayDirectAccess_lit.frag
@@ -23,7 +23,7 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST-LABEL: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST-LABEL: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH3C00, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestOffset_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestOffset_lit.frag
index d38d6e6749..de7aeef368 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestOffset_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestOffset_lit.frag
@@ -24,7 +24,7 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.o.2d.v4f32.f32({{.*}}, i32 770, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 770, {{.*}})
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestProjDrefGradOffset_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestProjDrefGradOffset_lit.frag
index 489924d4cd..a91b393efe 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestProjDrefGradOffset_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestProjDrefGradOffset_lit.frag
@@ -24,7 +24,7 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.sample.c.d.o.2d.f32.f32.f32({{.*}}, i32 770, {{.*}}, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, {{.*}})
+; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.sample.c.d.o.2d.f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 770, {{.*}}, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, {{.*}})
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestSeparate_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestSeparate_lit.frag
index 60923e7b20..61bcbd5510 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestSeparate_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestSeparate_lit.frag
@@ -23,7 +23,7 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half 0xH0000, half 0xH0000, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH0000, half 0xH0000, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureBiasClamp_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureBiasClamp_lit.frag
index 3950cc0e5e..5ae78479d3 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureBiasClamp_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureBiasClamp_lit.frag
@@ -76,30 +76,30 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[0-9]*}}, {{.*}})
+; SHADERTEST: call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float 2.000000e+00, float %{{[0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.3d.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.3d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubesc(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}})
 ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubetc(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}})
 ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubema(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}})
 ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubeid(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}})
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.cube.v4f32.f32.f32({{.*}}, float 2.000000e+00, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.cube.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float 2.000000e+00, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.1darray.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.1darray.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.2darray.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.2darray.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.cube.v4f32.f32.f32({{.*}}, float 2.000000e+00, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.cube.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float 2.000000e+00, {{.*}})
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureClamp_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureClamp_lit.frag
index 9afcacbc46..85ff1288c2 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureClamp_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureClamp_lit.frag
@@ -76,13 +76,13 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32({{.*}}, float %{{[0-9]*}}, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float %{{[0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32({{.*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.3d.v4f32.f32({{.*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.3d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubesc(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}})
@@ -92,10 +92,10 @@ void main()
 ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.cube.v4f32.f32
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.1darray.v4f32.f32({{.*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.1darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.2darray.v4f32.f32({{.*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.cube.v4f32.f32
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradClamp_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradClamp_lit.frag
index 1b71f891f6..f939a7f1a2 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradClamp_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradClamp_lit.frag
@@ -78,14 +78,14 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, {{.*}})
+; SHADERTEST: call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.3d.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.3d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubesc(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}})
@@ -95,10 +95,10 @@ void main()
 ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.cube.v4f32.f32.f32
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.1darray.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.1darray.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.2darray.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.2darray.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.cube.v4f32.f32.f32
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradOffsetClamp_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradOffsetClamp_lit.frag
index 00e98897f8..39a9f49448 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradOffsetClamp_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradOffsetClamp_lit.frag
@@ -63,20 +63,20 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{.*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{.*}}
-; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32({{.*}}, i32 2, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, {{.*}})
+; SHADERTEST: call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 2, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{.*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{.*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.2d.v4f32.f32.f32({{.*}}, i32 514, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.2d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 514, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{.*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{.*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.3d.v4f32.f32.f32({{.*}}, i32 131586, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.3d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 131586, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{.*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{.*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1darray.v4f32.f32.f32({{.*}}, i32 2, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0.000000e+00, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1darray.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 2, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0.000000e+00, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{.*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{.*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.2darray.v4f32.f32.f32({{.*}}, i32 514, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0.000000e+00, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.2darray.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 514, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0.000000e+00, {{.*}})
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureOffsetClamp_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureOffsetClamp_lit.frag
index 14e3c9a49f..45c4eedda1 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureOffsetClamp_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureOffsetClamp_lit.frag
@@ -63,20 +63,20 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32({{.*}}, i32 2, float 0x3FB99999A0000000, {{.*}})
+; SHADERTEST: call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 2, float 0x3FB99999A0000000, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.2d.v4f32.f32({{.*}}, i32 514, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 514, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.3d.v4f32.f32({{.*}}, i32 131586, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.3d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 131586, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.1darray.v4f32.f32({{.*}}, i32 2, float 0x3FB99999A0000000, float 0.000000e+00, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.1darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 2, float 0x3FB99999A0000000, float 0.000000e+00, {{.*}})
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.2darray.v4f32.f32({{.*}}, i32 514, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0.000000e+00, {{.*}})
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.2darray.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 514, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0.000000e+00, {{.*}})
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureOffset_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureOffset_lit.frag
index c42564c18c..2dbebcdfb2 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureOffset_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureOffset_lit.frag
@@ -36,11 +36,11 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.o.1d.v4f32.f32.f32({{.*}}, i32 2, float 0x3FD99999A0000000, float 1.000000e+00, {{.*}})
-; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.o.1d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, i32 2, float 0x3FD99999A0000000, float 1.000000e+00, {{.*}})
+; SHADERTEST: call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.o.2d.v4f32.f16(i32 15, i32 1285, half 0xH3800, half 0xH3800, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.o.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, i32 1285, half 0xH3800, half 0xH3800, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTexture_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTexture_lit.frag
index 0e8fb0c744..005016da61 100644
--- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTexture_lit.frag
+++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTexture_lit.frag
@@ -37,11 +37,11 @@ void main()
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32({{.*}}, float 0x3FD99999A0000000, float 1.000000e+00, {{.*}})
-; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float 0x3FD99999A0000000, float 1.000000e+00, {{.*}})
+; SHADERTEST: call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32
 ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}
 ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half 0xH3800, half 0xH3800, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH3800, half 0xH3800, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpImageSparseRead_TestInt64SparseImageLoad.spvasm b/llpc/test/shaderdb/core/OpImageSparseRead_TestInt64SparseImageLoad.spvasm
index 679993ee9a..f500946c55 100644
--- a/llpc/test/shaderdb/core/OpImageSparseRead_TestInt64SparseImageLoad.spvasm
+++ b/llpc/test/shaderdb/core/OpImageSparseRead_TestInt64SparseImageLoad.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call { <4 x i64>, i32 } (...) @"lgc.create.image.load.s[v4i64,i32]"
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call { <2 x i32>, i32 } @llvm.amdgcn.image.load.2d.sl_v2i32i32s.i16(i32 3, i16 3, i16 3, <8 x i32> %{{.*}}, i32 1, i32 0), !invariant.load !10
+; SHADERTEST: call { <2 x i32>, i32 } @llvm.amdgcn.image.load.2d.sl_v2i32i32s.i16{{(\.v8i32)?}}(i32 3, i16 3, i16 3, <8 x i32> %{{.*}}, i32 1, i32 0), !invariant.load !10
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpImageWrite_TestInt64ImageStore.spvasm b/llpc/test/shaderdb/core/OpImageWrite_TestInt64ImageStore.spvasm
index 7b6aff861a..8c80d8accd 100644
--- a/llpc/test/shaderdb/core/OpImageWrite_TestInt64ImageStore.spvasm
+++ b/llpc/test/shaderdb/core/OpImageWrite_TestInt64ImageStore.spvasm
@@ -3,7 +3,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: call void (...) @lgc.create.image.store(<4 x i64>
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST:  call void @llvm.amdgcn.image.store.2d.v2f32.i16(<2 x float> %{{.*}}, i32 3, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST:  call void @llvm.amdgcn.image.store.2d.v2f32.i16{{(\.v8i32)?}}(<2 x float> %{{.*}}, i32 3, i16 3, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 
 ; SPIR-V
diff --git a/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierShared_lit.comp b/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierShared_lit.comp
index c839780637..6d2c3194b9 100644
--- a/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierShared_lit.comp
+++ b/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierShared_lit.comp
@@ -15,7 +15,7 @@ void main()
 /*
 ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results
-; SHADERTEST: fence syncscope("agent") acq_rel
+; SHADERTEST: fence syncscope("workgroup") acq_rel
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/core/OpPtrEqualTest.spvasm b/llpc/test/shaderdb/core/OpPtrEqualTest.spvasm
index 2d521e743f..2864104bbb 100644
--- a/llpc/test/shaderdb/core/OpPtrEqualTest.spvasm
+++ b/llpc/test/shaderdb/core/OpPtrEqualTest.spvasm
@@ -88,12 +88,13 @@
 ; SHADERTEST-LABEL: @lgc.shader.CS.main(
 ; SHADERTEST-NEXT:  .entry:
 ; SHADERTEST-NEXT:    [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 2, i32 0, i32 2)
-; SHADERTEST-NEXT:    store i32 1, ptr addrspace(7) [[TMP0]], align 4
-; SHADERTEST-NEXT:    [[TMP1:%.*]] = getelementptr {{i8|<{ [[]4294967295 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{4|0, i32 0, i32 1}}
+; SHADERTEST-NEXT:    [[TMP1:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP0]], i32 4, i32 0)
 ; SHADERTEST-NEXT:    store i32 1, ptr addrspace(7) [[TMP1]], align 4
-; SHADERTEST-NEXT:    [[TMP2:%.*]] = getelementptr {{i8|<{ [[]4294967295 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 0, i32 2}}
+; SHADERTEST-NEXT:    [[TMP2:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP0]], i32 4, i32 1)
 ; SHADERTEST-NEXT:    store i32 1, ptr addrspace(7) [[TMP2]], align 4
-; SHADERTEST-NEXT:    [[TMP3:%.*]] = getelementptr {{i8|<{ [[]4294967295 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{12|0, i32 0, i32 3}}
-; SHADERTEST-NEXT:    store i32 0, ptr addrspace(7) [[TMP3]], align 4
+; SHADERTEST-NEXT:    [[TMP3:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP0]], i32 4, i32 2)
+; SHADERTEST-NEXT:    store i32 1, ptr addrspace(7) [[TMP3]], align 4
+; SHADERTEST-NEXT:    [[TMP4:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP0]], i32 4, i32 3)
+; SHADERTEST-NEXT:    store i32 0, ptr addrspace(7) [[TMP4]], align 4
 ; SHADERTEST-NEXT:    ret void
 ;
diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag
index c932e07fdb..8a63609974 100644
--- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag
+++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag
@@ -36,6 +36,6 @@ void main()
 // SHADERTEST-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 // SHADERTEST-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]]
 // SHADERTEST-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 4
-// SHADERTEST-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half 0xH0000, half 0xH0000, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
+// SHADERTEST-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, half 0xH0000, half 0xH0000, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
 // SHADERTEST-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]])
 // SHADERTEST: AMDLLPC SUCCESS
diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag
index ebfaa14d26..4eb41bef9c 100644
--- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag
+++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag
@@ -35,7 +35,7 @@ void main()
 // SHADERTEST-GFX-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 // SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]]
 // SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 4
-// SHADERTEST-GFX-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]])
 // SHADERTEST-GFX: AMDLLPC SUCCESS
 //
@@ -48,7 +48,7 @@ void main()
 // SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 4
 // SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]]
 // SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 4
-// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX_10_3_0-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]])
 // SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS
 //
@@ -65,6 +65,6 @@ void main()
 // SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
 // SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]]
 // SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 4
-// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX_10_3_2-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]])
 // SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS
diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag
index 486b00f87d..3cc08fb052 100644
--- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag
+++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag
@@ -45,7 +45,7 @@ void main()
 // SHADERTEST-GFX-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 // SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]]
 // SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 4
-// SHADERTEST-GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]])
 //
 // SHADERTEST-GFX-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]])
@@ -61,7 +61,7 @@ void main()
 // SHADERTEST-GFX-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 // SHADERTEST-GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]]
 // SHADERTEST-GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 4
-// SHADERTEST-GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]])
 //
 // SHADERTEST-GFX: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]])
@@ -77,7 +77,7 @@ void main()
 // SHADERTEST-GFX-NEXT: %[[shufflevector3:[0-9]+]] = shufflevector <8 x i32> %[[insert3]], <8 x i32> %[[load5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 // SHADERTEST-GFX-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]]
 // SHADERTEST-GFX-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 4
-// SHADERTEST-GFX-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector3]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX-NEXT: %[[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector3]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]])
 // SHADERTEST-GFX: AMDLLPC SUCCESS
 //
@@ -90,7 +90,7 @@ void main()
 // SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 4
 // SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]]
 // SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 4
-// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX_10_3_0-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]])
 //
 // SHADERTEST-GFX_10_3_0-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]])
@@ -100,7 +100,7 @@ void main()
 // SHADERTEST-GFX_10_3_0-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 4
 // SHADERTEST-GFX_10_3_0-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]]
 // SHADERTEST-GFX_10_3_0-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 4
-// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX_10_3_0-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]])
 //
 // SHADERTEST-GFX_10_3_0: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]])
@@ -110,7 +110,7 @@ void main()
 // SHADERTEST-GFX_10_3_0-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 4
 // SHADERTEST-GFX_10_3_0-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]]
 // SHADERTEST-GFX_10_3_0-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 4
-// SHADERTEST-GFX_10_3_0-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load5]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load5]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX_10_3_0-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]])
 // SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS
 //
@@ -127,7 +127,7 @@ void main()
 // SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
 // SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]]
 // SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 4
-// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]])
 //
 // SHADERTEST-GFX_10_3_2-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]])
@@ -141,7 +141,7 @@ void main()
 // SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
 // SHADERTEST-GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]]
 // SHADERTEST-GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 4
-// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]])
 //
 // SHADERTEST-GFX_10_3_2: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]])
@@ -155,6 +155,6 @@ void main()
 // SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector3:[0-9]+]] = shufflevector <8 x i32> %[[insert3]], <8 x i32> %[[load5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
 // SHADERTEST-GFX_10_3_2-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]]
 // SHADERTEST-GFX_10_3_2-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 4
-// SHADERTEST-GFX_10_3_2-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector3]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector3]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX_10_3_2-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]])
 // SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS
diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag
index 69719c1be2..6548e3e885 100644
--- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag
+++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag
@@ -38,7 +38,7 @@ void main()
 // SHADERTEST-GFX-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 // SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]]
 // SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 4
-// SHADERTEST-GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]])
 //
 // SHADERTEST-GFX: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]])
@@ -54,7 +54,7 @@ void main()
 // SHADERTEST-GFX-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 // SHADERTEST-GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]]
 // SHADERTEST-GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 4
-// SHADERTEST-GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]])
 // SHADERTEST-GFX: AMDLLPC SUCCESS
 
@@ -67,7 +67,7 @@ void main()
 // SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 4
 // SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]]
 // SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 4
-// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX_10_3_0-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]])
 //
 // SHADERTEST-GFX_10_3_0: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]])
@@ -77,7 +77,7 @@ void main()
 // SHADERTEST-GFX_10_3_0-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 4
 // SHADERTEST-GFX_10_3_0-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]]
 // SHADERTEST-GFX_10_3_0-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 4
-// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX_10_3_0-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]])
 // SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS
 
@@ -94,7 +94,7 @@ void main()
 // SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
 // SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]]
 // SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 4
-// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]])
 //
 // SHADERTEST-GFX_10_3_2: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]])
@@ -108,6 +108,6 @@ void main()
 // SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
 // SHADERTEST-GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]]
 // SHADERTEST-GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 4
-// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0)
+// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0)
 // SHADERTEST-GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]])
 // SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS
diff --git a/llpc/test/shaderdb/core/TestEnableImplicitInvariantExports.vert b/llpc/test/shaderdb/core/TestEnableImplicitInvariantExports.vert
index 76a6e50288..0c83b70f46 100644
--- a/llpc/test/shaderdb/core/TestEnableImplicitInvariantExports.vert
+++ b/llpc/test/shaderdb/core/TestEnableImplicitInvariantExports.vert
@@ -19,7 +19,7 @@ void main()
 
 // BEGIN_WITHOUT_IIE
 /*
-; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=WITHOUT_IIE %s
+; RUN: amdllpc -v --enable-implicit-invariant-exports=1 %gfxip %s | FileCheck -check-prefix=WITHOUT_IIE %s
 ; WITHOUT_IIE-LABEL: {{^// LLPC}} pipeline before-patching results
 ; WITHOUT_IIE: %[[val:.*]] = extractvalue [4 x <4 x float>] %{{.*}}, 3
 ; WITHOUT_IIE: %[[mul:.*]] = fmul nnan nsz <4 x float> %[[val]], %{{.*}}
diff --git a/llpc/test/shaderdb/core/TestXfbStateMetadata.vert b/llpc/test/shaderdb/core/TestXfbStateMetadata.vert
index ec588303a7..d30af81417 100644
--- a/llpc/test/shaderdb/core/TestXfbStateMetadata.vert
+++ b/llpc/test/shaderdb/core/TestXfbStateMetadata.vert
@@ -30,7 +30,7 @@ void main()
 //.
 // CHECK: attributes #[[ATTR0]] = { alwaysinline nounwind "denormal-fp-math-f32"="preserve-sign" }
 // CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) }
-// CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind }
+// CHECK: attributes #[[ATTR2:[0-9]+]] = {{{.*}} nounwind }
 //.
 // CHECK: [[META0:![0-9]+]] = !{!"Vulkan"}
 // CHECK: [[META1:![0-9]+]] = !{i32 1}
diff --git a/llpc/test/shaderdb/debug_info/NonSemanticShaderDebug.pipe b/llpc/test/shaderdb/debug_info/NonSemanticShaderDebug.pipe
index dcb10c1c50..5ad5647888 100644
--- a/llpc/test/shaderdb/debug_info/NonSemanticShaderDebug.pipe
+++ b/llpc/test/shaderdb/debug_info/NonSemanticShaderDebug.pipe
@@ -1,5 +1,5 @@
-; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py
-; RUN: amdllpc -trim-debug-info=false -v -gfxip 10.1 %s | FileCheck -check-prefixes=CHECK %s
+; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --check-globals
+; RUN: amdllpc -trim-debug-info=false -v -gfxip 10.1 %s | FileCheck -check-prefixes=CHECK %s
 
 [CsSpirv]
 ; SPIR-V
@@ -248,23 +248,67 @@ userDataNode[1].visibility = 2
 userDataNode[1].type = StreamOutTableVaPtr
 userDataNode[1].offsetInDwords = 1
 userDataNode[1].sizeInDwords = 1
+;.
 ; CHECK-LABEL: // LLPC SPIRV-to-LLVM translation results
-: CHECK-NEXT : [[TMP1:!.*]] = !DIFile(filename: "builtIncompute.comp"
-: CHECK-NEXT : [[TMP2:!.*]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: [[TMP1]],
-: CHECK-NEXT : [[TMP3:!.*]] = !DISubroutineType(flags: DIFlagPublic
-: CHECK-NEXT : [[TMP4:!.*]] = !DILocalVariable(name: "sx", scope: [[TMP2]], file: [[TMP1]], line: 16, type: [[TMP5]])
-: CHECK-NEXT : [[TMP5:!.*]] = !DIBasicType(name: "float", size: 64, encoding: DW_ATE_float)
-: CHECK-NEXT : [[TMP6:!.*]] = !DILocalVariable(name: "sy", scope: [[TMP2]], file: [[TMP1]], line: 17, type: [[TMP5]])
-: CHECK-NEXT : [[TMP7:!.*]] = !DILocalVariable(name: "sz", scope: [[TMP2]], file: [[TMP1]], line: 18, type: [[TMP5]])
-: CHECK-NEXT : [[TMP8:!.*]] = !DILocalVariable(name: "offset", scope: [[TMP2]], file: [[TMP1]], line: 19, type: [[TMP9]])
-: CHECK-NEXT : [[TMP9:!.*]] = !DIBasicType(name: "uint", size: 32, encoding: DW_ATE_unsigned)
-: CHECK-NEXT : [[TMP10:!.*]] = !DILocalVariable(name: "wgidx", scope: [[TMP2]], file: [[TMP1]], line: 20, type: [[TMP9]])
-: CHECK-NEXT : [[TMP11:!.*]] = !DILocalVariable(name: "wgidy", scope: [[TMP2]], file: [[TMP1]], line: 21, type: [[TMP9]])
-: CHECK-NEXT : [[TMP12:!.*]] = !DILocation(line: 14, column: 16, scope: [[TMP2]])
-: CHECK-NEXT : [[TMP13:!.*]] = !DILocation(line: 16, scope: [[TMP2]])
-: CHECK-NEXT : [[TMP14:!.*]] = !DILocation(line: 17, scope: [[TMP2]])
-: CHECK-NEXT : [[TMP15:!.*]] = !DILocation(line: 18, scope: [[TMP2]])
-: CHECK-NEXT : [[TMP16:!.*]] = !DILocation(line: 19, scope: [[TMP2]])
-: CHECK-NEXT : [[TMP17:!.*]] = !DILocation(line: 20, scope: [[TMP2]])
-: CHECK-NEXT : [[TMP18:!.*]] = !DILocation(line: 21, scope: [[TMP2]])
-: CHECK-NEXT : [[TMP19:!.*]] = !DILocation(line: 22, scope: [[TMP2]])
+; CHECK-NEXT : [[META0:![0-9]+]] = !{i32 0, i32 0}
+; CHECK-NEXT : [[META1:![0-9]+]] = !{{ i64, { i64 } } { i64 70368744177664, { i64 } { i64 8796093022208 } }}
+; CHECK-NEXT : [[META2:![0-9]+]] = !{{ i64, i64 } { i64 17039388, i64 0 }}
+; CHECK-NEXT : [[META3:![0-9]+]] = !{{ i64, i64 } { i64 17039386, i64 0 }}
+; CHECK-NEXT : [[META4:![0-9]+]] = !{i32 0, i32 1}
+; CHECK-NEXT : [[META5:![0-9]+]] = !{{ i64, { { i32, i64, i64 } } } { i64 70368744177664, { { i32, i64, i64 } } { { i32, i64, i64 } { i32 16, i64 0, i64 0 } } }}
+; CHECK-NEXT : [[META6:![0-9]+]] = !{{ i64, i64 } { i64 17039384, i64 0 }}
+; CHECK-NEXT : [[META7:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4}
+; CHECK-NEXT : [[META8:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
+; CHECK-NEXT : [[META9:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !10, producer: "spirv", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !11)
+; CHECK-NEXT : [[META10:![0-9]+]] = !DIFile(filename: "builtIncompute.comp", directory: ".", source: "// OpModuleProcessed client vulkan100\0A// OpModuleProcessed target-env vulkan1.0\0A// OpModuleProcessed entry-point main\0A#line 1\0A#version 460 core\0A#extension GL_KHR_shader_subgroup_basic : enable\0Alayout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in
+; CHECK-NEXT : [[META11:![0-9]+]] = !{!12, !21, !26, !28, !36}
+; CHECK-NEXT : [[META12:![0-9]+]] = !DIGlobalVariableExpression(var: !13, expr: !DIExpression())
+; CHECK-NEXT : [[META13:![0-9]+]] = distinct !DIGlobalVariable(name: "stride", linkageName: "stride", scope: !9, file: !10, line: 19, type: !14, isLocal: false, isDefinition: true)
+; CHECK-NEXT : [[META14:![0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "Stride", file: !10, line: 19, flags: DIFlagPublic, elements: !15, identifier: "Stride")
+; CHECK-NEXT : [[META15:![0-9]+]] = !{!16}
+; CHECK-NEXT : [[META16:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "u_stride", file: !10, line: 7, baseType: !17, flags: DIFlagPublic)
+; CHECK-NEXT : [[META17:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !18, size: 64, flags: DIFlagVector, elements: !19)
+; CHECK-NEXT : [[META18:![0-9]+]] = !DIBasicType(name: "uint", size: 32, encoding: DW_ATE_unsigned)
+; CHECK-NEXT : [[META19:![0-9]+]] = !{!20}
+; CHECK-NEXT : [[META20:![0-9]+]] = !DISubrange(count: 2, lowerBound: 0)
+; CHECK-NEXT : [[META21:![0-9]+]] = !DIGlobalVariableExpression(var: !22, expr: !DIExpression())
+; CHECK-NEXT : [[META22:![0-9]+]] = distinct !DIGlobalVariable(name: "gl_GlobalInvocationID", linkageName: "gl_GlobalInvocationID", scope: !9, file: !10, line: 19, type: !23, isLocal: false, isDefinition: true)
+; CHECK-NEXT : [[META23:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !18, size: 96, flags: DIFlagVector, elements: !24)
+; CHECK-NEXT : [[META24:![0-9]+]] = !{!25}
+; CHECK-NEXT : [[META25:![0-9]+]] = !DISubrange(count: 3, lowerBound: 0)
+; CHECK-NEXT : [[META26:![0-9]+]] = !DIGlobalVariableExpression(var: !27, expr: !DIExpression())
+; CHECK-NEXT : [[META27:![0-9]+]] = distinct !DIGlobalVariable(name: "gl_WorkGroupID", linkageName: "gl_WorkGroupID", scope: !9, file: !10, line: 20, type: !23, isLocal: false, isDefinition: true)
+; CHECK-NEXT : [[META28:![0-9]+]] = !DIGlobalVariableExpression(var: !29, expr: !DIExpression())
+; CHECK-NEXT : [[META29:![0-9]+]] = distinct !DIGlobalVariable(name: "sb_out", linkageName: "sb_out", scope: !9, file: !10, line: 22, type: !30, isLocal: false, isDefinition: true)
+; CHECK-NEXT : [[META30:![0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "Output", file: !10, line: 22, flags: DIFlagPublic, elements: !31, identifier: "Output")
+; CHECK-NEXT : [[META31:![0-9]+]] = !{!32}
+; CHECK-NEXT : [[META32:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "result", file: !10, line: 11, baseType: !33, flags: DIFlagPublic)
+; CHECK-NEXT : [[META33:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !23, elements: !34)
+; CHECK-NEXT : [[META34:![0-9]+]] = !{!35}
+; CHECK-NEXT : [[META35:![0-9]+]] = !DISubrange(count: 0, lowerBound: 0)
+; CHECK-NEXT : [[META36:![0-9]+]] = !DIGlobalVariableExpression(var: !37, expr: !DIExpression())
+; CHECK-NEXT : [[META37:![0-9]+]] = distinct !DIGlobalVariable(name: "gl_NumWorkGroups", linkageName: "gl_NumWorkGroups", scope: !9, file: !10, line: 22, type: !23, isLocal: false, isDefinition: true)
+; CHECK-NEXT : [[META38:![0-9]+]] = !{i32 1, i32 1, i32 1}
+; CHECK-NEXT : [[META39:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !10, type: !40, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !9, templateParams: !50, retainedNodes: !42)
+; CHECK-NEXT : [[META40:![0-9]+]] = !DISubroutineType(flags: DIFlagPublic, types: !41)
+; CHECK-NEXT : [[META41:![0-9]+]] = !{null}
+; CHECK-NEXT : [[META42:![0-9]+]] = !{!43, !45, !46, !47, !48, !49}
+; CHECK-NEXT : [[META43:![0-9]+]] = !DILocalVariable(name: "sx", scope: !39, file: !10, line: 16, type: !44)
+; CHECK-NEXT : [[META44:![0-9]+]] = !DIBasicType(name: "float", size: 64, encoding: DW_ATE_float)
+; CHECK-NEXT : [[META45:![0-9]+]] = !DILocalVariable(name: "sy", scope: !39, file: !10, line: 17, type: !44)
+; CHECK-NEXT : [[META46:![0-9]+]] = !DILocalVariable(name: "sz", scope: !39, file: !10, line: 18, type: !44)
+; CHECK-NEXT : [[META47:![0-9]+]] = !DILocalVariable(name: "offset", scope: !39, file: !10, line: 19, type: !18)
+; CHECK-NEXT : [[META48:![0-9]+]] = !DILocalVariable(name: "wgidx", scope: !39, file: !10, line: 20, type: !18)
+; CHECK-NEXT : [[META49:![0-9]+]] = !DILocalVariable(name: "wgidy", scope: !39, file: !10, line: 21, type: !18)
+; CHECK-NEXT : [[META50:![0-9]+]] = !{}
+; CHECK-NEXT : [[META51:![0-9]+]] = !{i32 5}
+; CHECK-NEXT : [[META52:![0-9]+]] = !{i32 7}
+; CHECK-NEXT : [[META53:![0-9]+]] = !DILocation(line: 14, column: 16, scope: !39)
+; CHECK-NEXT : [[META54:![0-9]+]] = !DILocation(line: 16, scope: !39)
+; CHECK-NEXT : [[META55:![0-9]+]] = !DILocation(line: 17, scope: !39)
+; CHECK-NEXT : [[META56:![0-9]+]] = !DILocation(line: 18, scope: !39)
+; CHECK-NEXT : [[META57:![0-9]+]] = !DILocation(line: 19, scope: !39)
+; CHECK-NEXT : [[META58:![0-9]+]] = !DILocation(line: 20, scope: !39)
+; CHECK-NEXT : [[META59:![0-9]+]] = !DILocation(line: 21, scope: !39)
+; CHECK-NEXT : [[META60:![0-9]+]] = !DILocation(line: 22, scope: !39)
+;.
diff --git a/llpc/test/shaderdb/debug_info/PipelineGsTess_TestVsTesGsMergeShader.pipe b/llpc/test/shaderdb/debug_info/PipelineGsTess_TestVsTesGsMergeShader.pipe
new file mode 100644
index 0000000000..444dca1613
--- /dev/null
+++ b/llpc/test/shaderdb/debug_info/PipelineGsTess_TestVsTesGsMergeShader.pipe
@@ -0,0 +1,1120 @@
+; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --check-globals
+; BEGIN_SHADERTEST
+; RUN: amdllpc --print-after=lgc-patch-prepare-pipeline-abi --enable-implicit-invariant-exports=1 -trim-debug-info=false 2>&1 %s | FileCheck -check-prefix=SHADERTEST %s
+
+[Version]
+version = 40
+
+[VsSpirv]
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 43
+; Schema: 0
+               OpCapability Shader
+               OpExtension "SPV_KHR_non_semantic_info"
+          %2 = OpExtInstImport "NonSemantic.Shader.DebugInfo.100"
+          %3 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint Vertex %main "main" %positionOut %position
+          %1 = OpString "test.vert"
+          %8 = OpString "uint"
+         %15 = OpString "main"
+         %18 = OpString "// OpModuleProcessed client vulkan100
+// OpModuleProcessed target-env vulkan1.0
+// OpModuleProcessed entry-point main
+#line 1
+#version 450
+layout(location = 0) in vec4 position;
+layout(location = 0) out vec4 positionOut;
+
+void main (void)
+{
+    positionOut = position;
+}"
+         %29 = OpString "float"
+         %36 = OpString "positionOut"
+         %41 = OpString "position"
+               OpName %main "main"
+               OpName %positionOut "positionOut"
+               OpName %position "position"
+               OpDecorate %positionOut Location 0
+               OpDecorate %position Location 0
+       %void = OpTypeVoid
+          %5 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+    %uint_32 = OpConstant %uint 32
+     %uint_6 = OpConstant %uint 6
+     %uint_0 = OpConstant %uint 0
+          %9 = OpExtInst %void %2 DebugTypeBasic %8 %uint_32 %uint_6 %uint_0
+     %uint_3 = OpConstant %uint 3
+          %6 = OpExtInst %void %2 DebugTypeFunction %uint_3 %void
+         %17 = OpExtInst %void %2 DebugSource %1 %18
+     %uint_1 = OpConstant %uint 1
+     %uint_4 = OpConstant %uint 4
+     %uint_2 = OpConstant %uint 2
+         %19 = OpExtInst %void %2 DebugCompilationUnit %uint_1 %uint_4 %17 %uint_2
+         %16 = OpExtInst %void %2 DebugFunction %15 %6 %17 %uint_0 %uint_0 %19 %15 %uint_3 %uint_0
+     %uint_7 = OpConstant %uint 7
+      %float = OpTypeFloat 32
+         %30 = OpExtInst %void %2 DebugTypeBasic %29 %uint_32 %uint_3 %uint_0
+    %v4float = OpTypeVector %float 4
+         %32 = OpExtInst %void %2 DebugTypeVector %30 %uint_4
+%_ptr_Output_v4float = OpTypePointer Output %v4float
+%positionOut = OpVariable %_ptr_Output_v4float Output
+     %uint_8 = OpConstant %uint 8
+         %35 = OpExtInst %void %2 DebugGlobalVariable %36 %32 %17 %uint_7 %uint_0 %19 %36 %positionOut %uint_8
+%_ptr_Input_v4float = OpTypePointer Input %v4float
+   %position = OpVariable %_ptr_Input_v4float Input
+         %40 = OpExtInst %void %2 DebugGlobalVariable %41 %32 %17 %uint_7 %uint_0 %19 %41 %position %uint_8
+               OpLine %1 5 16
+       %main = OpFunction %void None %5
+         %23 = OpLabel
+         %24 = OpExtInst %void %2 DebugFunctionDefinition %16 %main
+         %25 = OpExtInst %void %2 DebugScope %16
+         %26 = OpExtInst %void %2 DebugLine %17 %uint_7 %uint_7 %uint_0 %uint_0
+         %42 = OpLoad %v4float %position
+               OpStore %positionOut %42
+               OpReturn
+               OpFunctionEnd
+
+[VsInfo]
+entryPoint = main
+
+
+[TcsSpirv]
+               ; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 112
+; Schema: 0
+               OpCapability Tessellation
+               OpExtension "SPV_KHR_non_semantic_info"
+          %2 = OpExtInstImport "NonSemantic.Shader.DebugInfo.100"
+          %3 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint TessellationControl %main "main" %gl_TessLevelOuter %gl_TessLevelInner %gl_out %gl_InvocationID %gl_in
+               OpExecutionMode %main OutputVertices 3
+          %1 = OpString "test.tesc"
+          %8 = OpString "uint"
+         %15 = OpString "main"
+         %18 = OpString "// OpModuleProcessed client vulkan100
+// OpModuleProcessed target-env vulkan1.0
+// OpModuleProcessed entry-point main
+#line 1
+#version 450 core
+layout(vertices=3) out;
+
+void main(void)
+{
+    gl_TessLevelOuter[0] = 2.0;
+    gl_TessLevelOuter[1] = 2.0;
+    gl_TessLevelOuter[2] = 2.0;
+    gl_TessLevelInner[0] = 4.0;
+
+    gl_out[gl_InvocationID].gl_Position = gl_in[gl_InvocationID].gl_Position;
+}"
+         %28 = OpString "float"
+         %35 = OpString "gl_TessLevelOuter"
+         %38 = OpString "int"
+         %58 = OpString "gl_TessLevelInner"
+         %69 = OpString "gl_Position"
+         %72 = OpString "gl_PointSize"
+         %75 = OpString "gl_CullDistance"
+         %79 = OpString "gl_PerVertex"
+         %85 = OpString "gl_out"
+         %89 = OpString "gl_InvocationID"
+        %105 = OpString "gl_in"
+               OpName %main "main"
+               OpName %gl_TessLevelOuter "gl_TessLevelOuter"
+               OpName %gl_TessLevelInner "gl_TessLevelInner"
+               OpName %gl_PerVertex "gl_PerVertex"
+               OpMemberName %gl_PerVertex 0 "gl_Position"
+               OpMemberName %gl_PerVertex 1 "gl_PointSize"
+               OpMemberName %gl_PerVertex 2 "gl_ClipDistance"
+               OpMemberName %gl_PerVertex 3 "gl_CullDistance"
+               OpName %gl_out "gl_out"
+               OpName %gl_InvocationID "gl_InvocationID"
+               OpName %gl_PerVertex_0 "gl_PerVertex"
+               OpMemberName %gl_PerVertex_0 0 "gl_Position"
+               OpMemberName %gl_PerVertex_0 1 "gl_PointSize"
+               OpMemberName %gl_PerVertex_0 2 "gl_ClipDistance"
+               OpMemberName %gl_PerVertex_0 3 "gl_CullDistance"
+               OpName %gl_in "gl_in"
+               OpDecorate %gl_TessLevelOuter Patch
+               OpDecorate %gl_TessLevelOuter BuiltIn TessLevelOuter
+               OpDecorate %gl_TessLevelInner Patch
+               OpDecorate %gl_TessLevelInner BuiltIn TessLevelInner
+               OpMemberDecorate %gl_PerVertex 0 BuiltIn Position
+               OpMemberDecorate %gl_PerVertex 1 BuiltIn PointSize
+               OpMemberDecorate %gl_PerVertex 2 BuiltIn ClipDistance
+               OpMemberDecorate %gl_PerVertex 3 BuiltIn CullDistance
+               OpDecorate %gl_PerVertex Block
+               OpDecorate %gl_InvocationID BuiltIn InvocationId
+               OpMemberDecorate %gl_PerVertex_0 0 BuiltIn Position
+               OpMemberDecorate %gl_PerVertex_0 1 BuiltIn PointSize
+               OpMemberDecorate %gl_PerVertex_0 2 BuiltIn ClipDistance
+               OpMemberDecorate %gl_PerVertex_0 3 BuiltIn CullDistance
+               OpDecorate %gl_PerVertex_0 Block
+       %void = OpTypeVoid
+          %5 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+    %uint_32 = OpConstant %uint 32
+     %uint_6 = OpConstant %uint 6
+     %uint_0 = OpConstant %uint 0
+          %9 = OpExtInst %void %2 DebugTypeBasic %8 %uint_32 %uint_6 %uint_0
+     %uint_3 = OpConstant %uint 3
+          %6 = OpExtInst %void %2 DebugTypeFunction %uint_3 %void
+         %17 = OpExtInst %void %2 DebugSource %1 %18
+     %uint_1 = OpConstant %uint 1
+     %uint_4 = OpConstant %uint 4
+     %uint_2 = OpConstant %uint 2
+         %19 = OpExtInst %void %2 DebugCompilationUnit %uint_1 %uint_4 %17 %uint_2
+         %16 = OpExtInst %void %2 DebugFunction %15 %6 %17 %uint_0 %uint_0 %19 %15 %uint_3 %uint_0
+      %float = OpTypeFloat 32
+         %29 = OpExtInst %void %2 DebugTypeBasic %28 %uint_32 %uint_3 %uint_0
+%_arr_float_uint_4 = OpTypeArray %float %uint_4
+         %31 = OpExtInst %void %2 DebugTypeArray %29 %uint_4
+%_ptr_Output__arr_float_uint_4 = OpTypePointer Output %_arr_float_uint_4
+%gl_TessLevelOuter = OpVariable %_ptr_Output__arr_float_uint_4 Output
+     %uint_8 = OpConstant %uint 8
+         %34 = OpExtInst %void %2 DebugGlobalVariable %35 %31 %17 %uint_6 %uint_0 %19 %35 %gl_TessLevelOuter %uint_8
+        %int = OpTypeInt 32 1
+         %39 = OpExtInst %void %2 DebugTypeBasic %38 %uint_32 %uint_4 %uint_0
+      %int_0 = OpConstant %int 0
+    %float_2 = OpConstant %float 2
+%_ptr_Output_float = OpTypePointer Output %float
+     %uint_7 = OpConstant %uint 7
+      %int_1 = OpConstant %int 1
+      %int_2 = OpConstant %int 2
+     %uint_9 = OpConstant %uint 9
+%_arr_float_uint_2 = OpTypeArray %float %uint_2
+         %54 = OpExtInst %void %2 DebugTypeArray %29 %uint_2
+%_ptr_Output__arr_float_uint_2 = OpTypePointer Output %_arr_float_uint_2
+%gl_TessLevelInner = OpVariable %_ptr_Output__arr_float_uint_2 Output
+         %57 = OpExtInst %void %2 DebugGlobalVariable %58 %54 %17 %uint_9 %uint_0 %19 %58 %gl_TessLevelInner %uint_8
+    %float_4 = OpConstant %float 4
+    %uint_11 = OpConstant %uint 11
+    %v4float = OpTypeVector %float 4
+         %64 = OpExtInst %void %2 DebugTypeVector %29 %uint_4
+%_arr_float_uint_1 = OpTypeArray %float %uint_1
+         %66 = OpExtInst %void %2 DebugTypeArray %29 %uint_1
+%gl_PerVertex = OpTypeStruct %v4float %float %_arr_float_uint_1 %_arr_float_uint_1
+   %uint_110 = OpConstant %uint 110
+         %68 = OpExtInst %void %2 DebugTypeMember %69 %64 %17 %uint_1 %uint_110 %uint_0 %uint_0 %uint_3
+   %uint_128 = OpConstant %uint 128
+         %71 = OpExtInst %void %2 DebugTypeMember %72 %29 %17 %uint_1 %uint_128 %uint_0 %uint_0 %uint_3
+   %uint_171 = OpConstant %uint 171
+         %74 = OpExtInst %void %2 DebugTypeMember %75 %66 %17 %uint_1 %uint_171 %uint_0 %uint_0 %uint_3
+         %77 = OpExtInst %void %2 DebugTypeMember %75 %66 %17 %uint_1 %uint_171 %uint_0 %uint_0 %uint_3
+         %78 = OpExtInst %void %2 DebugTypeComposite %79 %uint_1 %17 %uint_11 %uint_0 %19 %79 %uint_0 %uint_3 %68 %71 %74 %77
+%_arr_gl_PerVertex_uint_3 = OpTypeArray %gl_PerVertex %uint_3
+         %81 = OpExtInst %void %2 DebugTypeArray %78 %uint_3
+%_ptr_Output__arr_gl_PerVertex_uint_3 = OpTypePointer Output %_arr_gl_PerVertex_uint_3
+     %gl_out = OpVariable %_ptr_Output__arr_gl_PerVertex_uint_3 Output
+         %84 = OpExtInst %void %2 DebugGlobalVariable %85 %81 %17 %uint_11 %uint_0 %19 %85 %gl_out %uint_8
+%_ptr_Input_int = OpTypePointer Input %int
+%gl_InvocationID = OpVariable %_ptr_Input_int Input
+         %88 = OpExtInst %void %2 DebugGlobalVariable %89 %39 %17 %uint_11 %uint_0 %19 %89 %gl_InvocationID %uint_8
+%gl_PerVertex_0 = OpTypeStruct %v4float %float %_arr_float_uint_1 %_arr_float_uint_1
+  %uint_1756 = OpConstant %uint 1756
+         %92 = OpExtInst %void %2 DebugTypeMember %69 %64 %17 %uint_1 %uint_1756 %uint_0 %uint_0 %uint_3
+  %uint_1774 = OpConstant %uint 1774
+         %94 = OpExtInst %void %2 DebugTypeMember %72 %29 %17 %uint_1 %uint_1774 %uint_0 %uint_0 %uint_3
+  %uint_1817 = OpConstant %uint 1817
+         %96 = OpExtInst %void %2 DebugTypeMember %75 %66 %17 %uint_1 %uint_1817 %uint_0 %uint_0 %uint_3
+         %98 = OpExtInst %void %2 DebugTypeMember %75 %66 %17 %uint_1 %uint_1817 %uint_0 %uint_0 %uint_3
+         %99 = OpExtInst %void %2 DebugTypeComposite %79 %uint_1 %17 %uint_11 %uint_0 %19 %79 %uint_0 %uint_3 %92 %94 %96 %98
+%_arr_gl_PerVertex_0_uint_32 = OpTypeArray %gl_PerVertex_0 %uint_32
+        %101 = OpExtInst %void %2 DebugTypeArray %99 %uint_32
+%_ptr_Input__arr_gl_PerVertex_0_uint_32 = OpTypePointer Input %_arr_gl_PerVertex_0_uint_32
+      %gl_in = OpVariable %_ptr_Input__arr_gl_PerVertex_0_uint_32 Input
+        %104 = OpExtInst %void %2 DebugGlobalVariable %105 %101 %17 %uint_11 %uint_0 %19 %105 %gl_in %uint_8
+%_ptr_Input_v4float = OpTypePointer Input %v4float
+%_ptr_Output_v4float = OpTypePointer Output %v4float
+               OpLine %1 4 15
+       %main = OpFunction %void None %5
+         %23 = OpLabel
+         %24 = OpExtInst %void %2 DebugFunctionDefinition %16 %main
+         %25 = OpExtInst %void %2 DebugScope %16
+         %26 = OpExtInst %void %2 DebugLine %17 %uint_6 %uint_6 %uint_0 %uint_0
+         %43 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_0
+               OpStore %43 %float_2
+         %44 = OpExtInst %void %2 DebugLine %17 %uint_7 %uint_7 %uint_0 %uint_0
+         %47 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_1
+               OpStore %47 %float_2
+         %48 = OpExtInst %void %2 DebugLine %17 %uint_8 %uint_8 %uint_0 %uint_0
+         %50 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_2
+               OpStore %50 %float_2
+         %51 = OpExtInst %void %2 DebugLine %17 %uint_9 %uint_9 %uint_0 %uint_0
+         %60 = OpAccessChain %_ptr_Output_float %gl_TessLevelInner %int_0
+               OpStore %60 %float_4
+         %61 = OpExtInst %void %2 DebugLine %17 %uint_11 %uint_11 %uint_0 %uint_0
+         %90 = OpLoad %int %gl_InvocationID
+        %106 = OpLoad %int %gl_InvocationID
+        %108 = OpAccessChain %_ptr_Input_v4float %gl_in %106 %int_0
+        %109 = OpLoad %v4float %108
+        %111 = OpAccessChain %_ptr_Output_v4float %gl_out %90 %int_0
+               OpStore %111 %109
+               OpReturn
+               OpFunctionEnd
+[TcsInfo]
+entryPoint = main
+
+[TesSpirv]
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 122
+; Schema: 0
+               OpCapability Tessellation
+               OpExtension "SPV_KHR_non_semantic_info"
+          %2 = OpExtInstImport "NonSemantic.Shader.DebugInfo.100"
+          %3 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint TessellationEvaluation %main "main" %gl_TessCoord %_ %gl_in
+               OpExecutionMode %main Triangles
+               OpExecutionMode %main SpacingFractionalEven
+               OpExecutionMode %main VertexOrderCcw
+          %1 = OpString "test.tese"
+          %8 = OpString "uint"
+         %15 = OpString "main"
+         %18 = OpString "// OpModuleProcessed client vulkan100
+// OpModuleProcessed target-env vulkan1.0
+// OpModuleProcessed entry-point main
+#line 1
+#version 450 core
+layout(triangles,fractional_even_spacing,ccw) in;
+
+void main(void)
+{
+    float u = gl_TessCoord[0];
+    float v = gl_TessCoord[1];
+    float w = gl_TessCoord[2];
+    gl_Position = gl_in[0].gl_Position * u + gl_in[1].gl_Position * v + gl_in[2].gl_Position * w;
+}"
+         %28 = OpString "float"
+         %33 = OpString "u"
+         %41 = OpString "gl_TessCoord"
+         %50 = OpString "v"
+         %57 = OpString "w"
+         %69 = OpString "gl_Position"
+         %72 = OpString "gl_PointSize"
+         %75 = OpString "gl_CullDistance"
+         %79 = OpString "gl_PerVertex"
+         %83 = OpString ""
+         %85 = OpString "int"
+        %102 = OpString "gl_in"
+               OpName %main "main"
+               OpName %u "u"
+               OpName %gl_TessCoord "gl_TessCoord"
+               OpName %v "v"
+               OpName %w "w"
+               OpName %gl_PerVertex "gl_PerVertex"
+               OpMemberName %gl_PerVertex 0 "gl_Position"
+               OpMemberName %gl_PerVertex 1 "gl_PointSize"
+               OpMemberName %gl_PerVertex 2 "gl_ClipDistance"
+               OpMemberName %gl_PerVertex 3 "gl_CullDistance"
+               OpName %_ ""
+               OpName %gl_PerVertex_0 "gl_PerVertex"
+               OpMemberName %gl_PerVertex_0 0 "gl_Position"
+               OpMemberName %gl_PerVertex_0 1 "gl_PointSize"
+               OpMemberName %gl_PerVertex_0 2 "gl_ClipDistance"
+               OpMemberName %gl_PerVertex_0 3 "gl_CullDistance"
+               OpName %gl_in "gl_in"
+               OpDecorate %gl_TessCoord BuiltIn TessCoord
+               OpMemberDecorate %gl_PerVertex 0 BuiltIn Position
+               OpMemberDecorate %gl_PerVertex 1 BuiltIn PointSize
+               OpMemberDecorate %gl_PerVertex 2 BuiltIn ClipDistance
+               OpMemberDecorate %gl_PerVertex 3 BuiltIn CullDistance
+               OpDecorate %gl_PerVertex Block
+               OpMemberDecorate %gl_PerVertex_0 0 BuiltIn Position
+               OpMemberDecorate %gl_PerVertex_0 1 BuiltIn PointSize
+               OpMemberDecorate %gl_PerVertex_0 2 BuiltIn ClipDistance
+               OpMemberDecorate %gl_PerVertex_0 3 BuiltIn CullDistance
+               OpDecorate %gl_PerVertex_0 Block
+       %void = OpTypeVoid
+          %5 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+    %uint_32 = OpConstant %uint 32
+     %uint_6 = OpConstant %uint 6
+     %uint_0 = OpConstant %uint 0
+          %9 = OpExtInst %void %2 DebugTypeBasic %8 %uint_32 %uint_6 %uint_0
+     %uint_3 = OpConstant %uint 3
+          %6 = OpExtInst %void %2 DebugTypeFunction %uint_3 %void
+         %17 = OpExtInst %void %2 DebugSource %1 %18
+     %uint_1 = OpConstant %uint 1
+     %uint_4 = OpConstant %uint 4
+     %uint_2 = OpConstant %uint 2
+         %19 = OpExtInst %void %2 DebugCompilationUnit %uint_1 %uint_4 %17 %uint_2
+         %16 = OpExtInst %void %2 DebugFunction %15 %6 %17 %uint_0 %uint_0 %19 %15 %uint_3 %uint_0
+      %float = OpTypeFloat 32
+         %29 = OpExtInst %void %2 DebugTypeBasic %28 %uint_32 %uint_3 %uint_0
+%_ptr_Function_float = OpTypePointer Function %float
+         %32 = OpExtInst %void %2 DebugLocalVariable %33 %29 %17 %uint_6 %uint_0 %16 %uint_4
+         %35 = OpExtInst %void %2 DebugExpression
+    %v3float = OpTypeVector %float 3
+         %37 = OpExtInst %void %2 DebugTypeVector %29 %uint_3
+%_ptr_Input_v3float = OpTypePointer Input %v3float
+%gl_TessCoord = OpVariable %_ptr_Input_v3float Input
+     %uint_8 = OpConstant %uint 8
+         %40 = OpExtInst %void %2 DebugGlobalVariable %41 %37 %17 %uint_6 %uint_0 %19 %41 %gl_TessCoord %uint_8
+%_ptr_Input_float = OpTypePointer Input %float
+     %uint_7 = OpConstant %uint 7
+         %49 = OpExtInst %void %2 DebugLocalVariable %50 %29 %17 %uint_7 %uint_0 %16 %uint_4
+         %56 = OpExtInst %void %2 DebugLocalVariable %57 %29 %17 %uint_8 %uint_0 %16 %uint_4
+     %uint_9 = OpConstant %uint 9
+    %v4float = OpTypeVector %float 4
+         %64 = OpExtInst %void %2 DebugTypeVector %29 %uint_4
+%_arr_float_uint_1 = OpTypeArray %float %uint_1
+         %66 = OpExtInst %void %2 DebugTypeArray %29 %uint_1
+%gl_PerVertex = OpTypeStruct %v4float %float %_arr_float_uint_1 %_arr_float_uint_1
+   %uint_165 = OpConstant %uint 165
+         %68 = OpExtInst %void %2 DebugTypeMember %69 %64 %17 %uint_1 %uint_165 %uint_0 %uint_0 %uint_3
+   %uint_183 = OpConstant %uint 183
+         %71 = OpExtInst %void %2 DebugTypeMember %72 %29 %17 %uint_1 %uint_183 %uint_0 %uint_0 %uint_3
+   %uint_226 = OpConstant %uint 226
+         %74 = OpExtInst %void %2 DebugTypeMember %75 %66 %17 %uint_1 %uint_226 %uint_0 %uint_0 %uint_3
+         %77 = OpExtInst %void %2 DebugTypeMember %75 %66 %17 %uint_1 %uint_226 %uint_0 %uint_0 %uint_3
+         %78 = OpExtInst %void %2 DebugTypeComposite %79 %uint_1 %17 %uint_9 %uint_0 %19 %79 %uint_0 %uint_3 %68 %71 %74 %77
+%_ptr_Output_gl_PerVertex = OpTypePointer Output %gl_PerVertex
+          %_ = OpVariable %_ptr_Output_gl_PerVertex Output
+         %82 = OpExtInst %void %2 DebugGlobalVariable %83 %78 %17 %uint_9 %uint_0 %19 %83 %_ %uint_8
+        %int = OpTypeInt 32 1
+         %86 = OpExtInst %void %2 DebugTypeBasic %85 %uint_32 %uint_4 %uint_0
+      %int_0 = OpConstant %int 0
+%gl_PerVertex_0 = OpTypeStruct %v4float %float %_arr_float_uint_1 %_arr_float_uint_1
+  %uint_1756 = OpConstant %uint 1756
+         %89 = OpExtInst %void %2 DebugTypeMember %69 %64 %17 %uint_1 %uint_1756 %uint_0 %uint_0 %uint_3
+  %uint_1774 = OpConstant %uint 1774
+         %91 = OpExtInst %void %2 DebugTypeMember %72 %29 %17 %uint_1 %uint_1774 %uint_0 %uint_0 %uint_3
+  %uint_1817 = OpConstant %uint 1817
+         %93 = OpExtInst %void %2 DebugTypeMember %75 %66 %17 %uint_1 %uint_1817 %uint_0 %uint_0 %uint_3
+         %95 = OpExtInst %void %2 DebugTypeMember %75 %66 %17 %uint_1 %uint_1817 %uint_0 %uint_0 %uint_3
+         %96 = OpExtInst %void %2 DebugTypeComposite %79 %uint_1 %17 %uint_9 %uint_0 %19 %79 %uint_0 %uint_3 %89 %91 %93 %95
+%_arr_gl_PerVertex_0_uint_32 = OpTypeArray %gl_PerVertex_0 %uint_32
+         %98 = OpExtInst %void %2 DebugTypeArray %96 %uint_32
+%_ptr_Input__arr_gl_PerVertex_0_uint_32 = OpTypePointer Input %_arr_gl_PerVertex_0_uint_32
+      %gl_in = OpVariable %_ptr_Input__arr_gl_PerVertex_0_uint_32 Input
+        %101 = OpExtInst %void %2 DebugGlobalVariable %102 %98 %17 %uint_9 %uint_0 %19 %102 %gl_in %uint_8
+%_ptr_Input_v4float = OpTypePointer Input %v4float
+      %int_1 = OpConstant %int 1
+      %int_2 = OpConstant %int 2
+%_ptr_Output_v4float = OpTypePointer Output %v4float
+               OpLine %1 4 15
+       %main = OpFunction %void None %5
+         %23 = OpLabel
+          %u = OpVariable %_ptr_Function_float Function
+          %v = OpVariable %_ptr_Function_float Function
+          %w = OpVariable %_ptr_Function_float Function
+         %24 = OpExtInst %void %2 DebugFunctionDefinition %16 %main
+         %25 = OpExtInst %void %2 DebugScope %16
+         %26 = OpExtInst %void %2 DebugLine %17 %uint_6 %uint_6 %uint_0 %uint_0
+         %34 = OpExtInst %void %2 DebugDeclare %32 %u %35
+         %44 = OpAccessChain %_ptr_Input_float %gl_TessCoord %uint_0
+         %45 = OpLoad %float %44
+               OpStore %u %45
+         %46 = OpExtInst %void %2 DebugLine %17 %uint_7 %uint_7 %uint_0 %uint_0
+         %51 = OpExtInst %void %2 DebugDeclare %49 %v %35
+         %52 = OpAccessChain %_ptr_Input_float %gl_TessCoord %uint_1
+         %53 = OpLoad %float %52
+               OpStore %v %53
+         %54 = OpExtInst %void %2 DebugLine %17 %uint_8 %uint_8 %uint_0 %uint_0
+         %58 = OpExtInst %void %2 DebugDeclare %56 %w %35
+         %59 = OpAccessChain %_ptr_Input_float %gl_TessCoord %uint_2
+         %60 = OpLoad %float %59
+               OpStore %w %60
+         %61 = OpExtInst %void %2 DebugLine %17 %uint_9 %uint_9 %uint_0 %uint_0
+        %104 = OpAccessChain %_ptr_Input_v4float %gl_in %int_0 %int_0
+        %105 = OpLoad %v4float %104
+        %106 = OpLoad %float %u
+        %107 = OpVectorTimesScalar %v4float %105 %106
+        %109 = OpAccessChain %_ptr_Input_v4float %gl_in %int_1 %int_0
+        %110 = OpLoad %v4float %109
+        %111 = OpLoad %float %v
+        %112 = OpVectorTimesScalar %v4float %110 %111
+        %113 = OpFAdd %v4float %107 %112
+        %115 = OpAccessChain %_ptr_Input_v4float %gl_in %int_2 %int_0
+        %116 = OpLoad %v4float %115
+        %117 = OpLoad %float %w
+        %118 = OpVectorTimesScalar %v4float %116 %117
+        %119 = OpFAdd %v4float %113 %118
+        %121 = OpAccessChain %_ptr_Output_v4float %_ %int_0
+               OpStore %121 %119
+               OpReturn
+               OpFunctionEnd
+
+[TesInfo]
+entryPoint = main
+
+[GsSpirv]
+               ; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 110
+; Schema: 0
+               OpCapability Geometry
+               OpExtension "SPV_KHR_non_semantic_info"
+          %2 = OpExtInstImport "NonSemantic.Shader.DebugInfo.100"
+          %3 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint Geometry %main "main" %_ %gl_in
+               OpExecutionMode %main Triangles
+               OpExecutionMode %main Invocations 1
+               OpExecutionMode %main OutputTriangleStrip
+               OpExecutionMode %main OutputVertices 3
+          %1 = OpString "test.geom"
+          %8 = OpString "uint"
+         %15 = OpString "main"
+         %18 = OpString "// OpModuleProcessed client vulkan100
+// OpModuleProcessed target-env vulkan1.0
+// OpModuleProcessed entry-point main
+#line 1
+#version 450 core
+layout(triangles) in;
+layout(triangle_strip, max_vertices = 3) out;
+
+void main()
+{
+    for (uint i = 0; i < gl_in.length(); ++i)
+    {
+        gl_Position = gl_in[i].gl_Position;
+        EmitVertex();
+    }
+    EndPrimitive();
+}"
+         %31 = OpString "i"
+         %45 = OpString "bool"
+         %52 = OpString "float"
+         %60 = OpString "gl_Position"
+         %63 = OpString "gl_PointSize"
+         %66 = OpString "gl_CullDistance"
+         %69 = OpString "gl_PerVertex"
+         %73 = OpString ""
+         %76 = OpString "int"
+         %93 = OpString "gl_in"
+               OpName %main "main"
+               OpName %i "i"
+               OpName %gl_PerVertex "gl_PerVertex"
+               OpMemberName %gl_PerVertex 0 "gl_Position"
+               OpMemberName %gl_PerVertex 1 "gl_PointSize"
+               OpMemberName %gl_PerVertex 2 "gl_ClipDistance"
+               OpMemberName %gl_PerVertex 3 "gl_CullDistance"
+               OpName %_ ""
+               OpName %gl_PerVertex_0 "gl_PerVertex"
+               OpMemberName %gl_PerVertex_0 0 "gl_Position"
+               OpMemberName %gl_PerVertex_0 1 "gl_PointSize"
+               OpMemberName %gl_PerVertex_0 2 "gl_ClipDistance"
+               OpMemberName %gl_PerVertex_0 3 "gl_CullDistance"
+               OpName %gl_in "gl_in"
+               OpMemberDecorate %gl_PerVertex 0 BuiltIn Position
+               OpMemberDecorate %gl_PerVertex 1 BuiltIn PointSize
+               OpMemberDecorate %gl_PerVertex 2 BuiltIn ClipDistance
+               OpMemberDecorate %gl_PerVertex 3 BuiltIn CullDistance
+               OpDecorate %gl_PerVertex Block
+               OpMemberDecorate %gl_PerVertex_0 0 BuiltIn Position
+               OpMemberDecorate %gl_PerVertex_0 1 BuiltIn PointSize
+               OpMemberDecorate %gl_PerVertex_0 2 BuiltIn ClipDistance
+               OpMemberDecorate %gl_PerVertex_0 3 BuiltIn CullDistance
+               OpDecorate %gl_PerVertex_0 Block
+       %void = OpTypeVoid
+          %5 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+    %uint_32 = OpConstant %uint 32
+     %uint_6 = OpConstant %uint 6
+     %uint_0 = OpConstant %uint 0
+          %9 = OpExtInst %void %2 DebugTypeBasic %8 %uint_32 %uint_6 %uint_0
+     %uint_3 = OpConstant %uint 3
+          %6 = OpExtInst %void %2 DebugTypeFunction %uint_3 %void
+         %17 = OpExtInst %void %2 DebugSource %1 %18
+     %uint_1 = OpConstant %uint 1
+     %uint_4 = OpConstant %uint 4
+     %uint_2 = OpConstant %uint 2
+         %19 = OpExtInst %void %2 DebugCompilationUnit %uint_1 %uint_4 %17 %uint_2
+         %16 = OpExtInst %void %2 DebugFunction %15 %6 %17 %uint_0 %uint_0 %19 %15 %uint_3 %uint_0
+     %uint_7 = OpConstant %uint 7
+%_ptr_Function_uint = OpTypePointer Function %uint
+         %30 = OpExtInst %void %2 DebugLocalVariable %31 %9 %17 %uint_7 %uint_0 %16 %uint_4
+         %33 = OpExtInst %void %2 DebugExpression
+       %bool = OpTypeBool
+         %46 = OpExtInst %void %2 DebugTypeBasic %45 %uint_32 %uint_2 %uint_0
+     %uint_9 = OpConstant %uint 9
+      %float = OpTypeFloat 32
+         %53 = OpExtInst %void %2 DebugTypeBasic %52 %uint_32 %uint_3 %uint_0
+    %v4float = OpTypeVector %float 4
+         %55 = OpExtInst %void %2 DebugTypeVector %53 %uint_4
+%_arr_float_uint_1 = OpTypeArray %float %uint_1
+         %57 = OpExtInst %void %2 DebugTypeArray %53 %uint_1
+%gl_PerVertex = OpTypeStruct %v4float %float %_arr_float_uint_1 %_arr_float_uint_1
+   %uint_215 = OpConstant %uint 215
+         %59 = OpExtInst %void %2 DebugTypeMember %60 %55 %17 %uint_2 %uint_215 %uint_0 %uint_0 %uint_3
+   %uint_233 = OpConstant %uint 233
+         %62 = OpExtInst %void %2 DebugTypeMember %63 %53 %17 %uint_2 %uint_233 %uint_0 %uint_0 %uint_3
+         %65 = OpExtInst %void %2 DebugTypeMember %66 %57 %17 %uint_3 %uint_7 %uint_0 %uint_0 %uint_3
+         %67 = OpExtInst %void %2 DebugTypeMember %66 %57 %17 %uint_3 %uint_7 %uint_0 %uint_0 %uint_3
+         %68 = OpExtInst %void %2 DebugTypeComposite %69 %uint_1 %17 %uint_9 %uint_0 %19 %69 %uint_0 %uint_3 %59 %62 %65 %67
+%_ptr_Output_gl_PerVertex = OpTypePointer Output %gl_PerVertex
+          %_ = OpVariable %_ptr_Output_gl_PerVertex Output
+     %uint_8 = OpConstant %uint 8
+         %72 = OpExtInst %void %2 DebugGlobalVariable %73 %68 %17 %uint_9 %uint_0 %19 %73 %_ %uint_8
+        %int = OpTypeInt 32 1
+         %77 = OpExtInst %void %2 DebugTypeBasic %76 %uint_32 %uint_4 %uint_0
+      %int_0 = OpConstant %int 0
+%gl_PerVertex_0 = OpTypeStruct %v4float %float %_arr_float_uint_1 %_arr_float_uint_1
+    %uint_23 = OpConstant %uint 23
+         %80 = OpExtInst %void %2 DebugTypeMember %60 %55 %17 %uint_2 %uint_23 %uint_0 %uint_0 %uint_3
+    %uint_41 = OpConstant %uint 41
+         %82 = OpExtInst %void %2 DebugTypeMember %63 %53 %17 %uint_2 %uint_41 %uint_0 %uint_0 %uint_3
+    %uint_84 = OpConstant %uint 84
+         %84 = OpExtInst %void %2 DebugTypeMember %66 %57 %17 %uint_2 %uint_84 %uint_0 %uint_0 %uint_3
+         %86 = OpExtInst %void %2 DebugTypeMember %66 %57 %17 %uint_2 %uint_84 %uint_0 %uint_0 %uint_3
+         %87 = OpExtInst %void %2 DebugTypeComposite %69 %uint_1 %17 %uint_9 %uint_0 %19 %69 %uint_0 %uint_3 %80 %82 %84 %86
+%_arr_gl_PerVertex_0_uint_3 = OpTypeArray %gl_PerVertex_0 %uint_3
+         %89 = OpExtInst %void %2 DebugTypeArray %87 %uint_3
+%_ptr_Input__arr_gl_PerVertex_0_uint_3 = OpTypePointer Input %_arr_gl_PerVertex_0_uint_3
+      %gl_in = OpVariable %_ptr_Input__arr_gl_PerVertex_0_uint_3 Input
+         %92 = OpExtInst %void %2 DebugGlobalVariable %93 %89 %17 %uint_9 %uint_0 %19 %93 %gl_in %uint_8
+%_ptr_Input_v4float = OpTypePointer Input %v4float
+%_ptr_Output_v4float = OpTypePointer Output %v4float
+    %uint_10 = OpConstant %uint 10
+      %int_1 = OpConstant %int 1
+    %uint_12 = OpConstant %uint 12
+               OpLine %1 5 11
+       %main = OpFunction %void None %5
+         %23 = OpLabel
+          %i = OpVariable %_ptr_Function_uint Function
+         %24 = OpExtInst %void %2 DebugFunctionDefinition %16 %main
+         %25 = OpExtInst %void %2 DebugScope %16
+         %26 = OpExtInst %void %2 DebugLine %17 %uint_7 %uint_7 %uint_0 %uint_0
+         %32 = OpExtInst %void %2 DebugDeclare %30 %i %33
+               OpStore %i %uint_0
+               OpBranch %34
+         %34 = OpLabel
+         %38 = OpExtInst %void %2 DebugScope %16
+         %39 = OpExtInst %void %2 DebugLine %17 %uint_7 %uint_7 %uint_0 %uint_0
+               OpLoopMerge %36 %37 None
+               OpBranch %40
+         %40 = OpLabel
+         %41 = OpExtInst %void %2 DebugScope %16
+         %42 = OpExtInst %void %2 DebugLine %17 %uint_7 %uint_7 %uint_0 %uint_0
+         %43 = OpLoad %uint %i
+         %47 = OpULessThan %bool %43 %uint_3
+               OpBranchConditional %47 %35 %36
+         %35 = OpLabel
+         %48 = OpExtInst %void %2 DebugScope %16
+         %49 = OpExtInst %void %2 DebugLine %17 %uint_9 %uint_9 %uint_0 %uint_0
+         %94 = OpLoad %uint %i
+         %96 = OpAccessChain %_ptr_Input_v4float %gl_in %94 %int_0
+         %97 = OpLoad %v4float %96
+         %99 = OpAccessChain %_ptr_Output_v4float %_ %int_0
+               OpStore %99 %97
+        %100 = OpExtInst %void %2 DebugLine %17 %uint_10 %uint_10 %uint_0 %uint_0
+               OpEmitVertex
+               OpBranch %37
+         %37 = OpLabel
+        %102 = OpExtInst %void %2 DebugScope %16
+        %103 = OpExtInst %void %2 DebugLine %17 %uint_7 %uint_7 %uint_0 %uint_0
+        %104 = OpLoad %uint %i
+        %106 = OpIAdd %uint %104 %int_1
+               OpStore %i %106
+               OpBranch %34
+         %36 = OpLabel
+        %107 = OpExtInst %void %2 DebugScope %16
+        %108 = OpExtInst %void %2 DebugLine %17 %uint_12 %uint_12 %uint_0 %uint_0
+               OpEndPrimitive
+               OpReturn
+               OpFunctionEnd
+
+[GsInfo]
+entryPoint = main
+
+[GraphicsPipelineState]
+topology = VK_PRIMITIVE_TOPOLOGY_PATCH_LIST
+patchControlPoints = 3
+
+[VertexInputState]
+binding[0].binding = 0
+binding[0].stride = 16
+binding[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX
+attribute[0].location = 0
+attribute[0].binding = 0
+attribute[0].format = VK_FORMAT_R32G32B32A32_SFLOAT
+attribute[0].offset = 0
+attribute[1].location = 1
+attribute[1].binding = 0
+attribute[1].format = VK_FORMAT_R32G32_SFLOAT
+attribute[1].offset = 0
+attribute[2].location = 2
+attribute[2].binding = 0
+attribute[2].format = VK_FORMAT_R32G32_SFLOAT
+attribute[2].offset = 0
+;.
+; SHADERTEST: @[[LDS_GS:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(3) global [1250 x i32], align 4
+; SHADERTEST: @[[LDS_HS:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(3) global [1152 x i32], align 4
+;.
+; SHADERTEST-LABEL: @_amdgpu_hs_main(
+; SHADERTEST-NEXT:  .entry:
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.init.exec(i64 -1)
+; SHADERTEST-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; SHADERTEST-NEXT:    [[THREADIDINWAVE:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP12]])
+; SHADERTEST-NEXT:    [[LSVERTCOUNT:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[MERGEWAVEINFO:%.*]], i32 0, i32 8)
+; SHADERTEST-NEXT:    [[HSVERTCOUNT:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[MERGEWAVEINFO]], i32 8, i32 8)
+; SHADERTEST-NEXT:    [[VALIDLSVERT:%.*]] = icmp ult i32 [[THREADIDINWAVE]], [[LSVERTCOUNT]]
+; SHADERTEST-NEXT:    br i1 [[VALIDLSVERT]], label [[DOTBEGINLS:%.*]], label [[DOTENDLS:%.*]]
+; SHADERTEST:       .beginLs:
+; SHADERTEST-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP5:%.*]], i64 0
+; SHADERTEST-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP5]], i64 1
+; SHADERTEST-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP5]], i64 2
+; SHADERTEST-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP5]], i64 3
+; SHADERTEST-NEXT:    call amdgpu_ls void @_amdgpu_ls_main(i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP15]], i32 [[TMP16]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 poison, i32 [[TMP11:%.*]]), !dbg [[DBG100:![0-9]+]]
+; SHADERTEST-NEXT:    br label [[DOTENDLS]]
+; SHADERTEST:       .endLs:
+; SHADERTEST-NEXT:    fence syncscope("workgroup") release
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.s.barrier()
+; SHADERTEST-NEXT:    fence syncscope("workgroup") acquire
+; SHADERTEST-NEXT:    [[VALIDHSVERT:%.*]] = icmp ult i32 [[THREADIDINWAVE]], [[HSVERTCOUNT]]
+; SHADERTEST-NEXT:    br i1 [[VALIDHSVERT]], label [[DOTBEGINHS:%.*]], label [[DOTENDHS:%.*]]
+; SHADERTEST:       .beginHs:
+; SHADERTEST-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP5]], i64 0
+; SHADERTEST-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[TMP5]], i64 1
+; SHADERTEST-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP5]], i64 2
+; SHADERTEST-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP5]], i64 3
+; SHADERTEST-NEXT:    call amdgpu_hs void @_amdgpu_hs_main.1(i32 [[TMP17]], i32 [[TMP18]], i32 [[TMP19]], i32 [[TMP20]], i32 [[OFFCHIPLDSBASE:%.*]], i32 [[TFBUFFERBASE:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]]), !dbg [[DBG100]]
+; SHADERTEST-NEXT:    br label [[DOTENDHS]]
+; SHADERTEST:       .endHs:
+; SHADERTEST-NEXT:    ret void
+;
+;
+; SHADERTEST-LABEL: @_amdgpu_gs_main(
+; SHADERTEST-NEXT:  .entry:
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.init.exec(i64 -1)
+; SHADERTEST-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; SHADERTEST-NEXT:    [[THREADIDINWAVE:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP15]])
+; SHADERTEST-NEXT:    [[ESVERTCOUNT:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[MERGEDWAVEINFO:%.*]], i32 0, i32 8)
+; SHADERTEST-NEXT:    [[GSPRIMCOUNT:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[MERGEDWAVEINFO]], i32 8, i32 8)
+; SHADERTEST-NEXT:    [[GSWAVEID:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[MERGEDWAVEINFO]], i32 16, i32 8)
+; SHADERTEST-NEXT:    [[WAVEINSUBGROUP:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[MERGEDWAVEINFO]], i32 24, i32 4)
+; SHADERTEST-NEXT:    [[TMP16:%.*]] = mul i32 [[WAVEINSUBGROUP]], 320
+; SHADERTEST-NEXT:    [[VALIDESVERT:%.*]] = icmp ult i32 [[THREADIDINWAVE]], [[ESVERTCOUNT]]
+; SHADERTEST-NEXT:    br i1 [[VALIDESVERT]], label [[DOTBEGINES:%.*]], label [[DOTENDES:%.*]]
+; SHADERTEST:       .beginEs:
+; SHADERTEST-NEXT:    [[TMP17:%.*]] = extractelement <1 x i32> [[TMP5:%.*]], i64 0
+; SHADERTEST-NEXT:    call amdgpu_es void @_amdgpu_es_main(i32 [[TMP17]], i32 [[OFFCHIPLDSBASE:%.*]], i32 [[TMP16]], float [[TMP11:%.*]], float [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]]), !dbg [[DBG103:![0-9]+]]
+; SHADERTEST-NEXT:    br label [[DOTENDES]]
+; SHADERTEST:       .endEs:
+; SHADERTEST-NEXT:    fence syncscope("workgroup") release
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.s.barrier()
+; SHADERTEST-NEXT:    fence syncscope("workgroup") acquire
+; SHADERTEST-NEXT:    [[VALIDGSPRIM:%.*]] = icmp ult i32 [[THREADIDINWAVE]], [[GSPRIMCOUNT]]
+; SHADERTEST-NEXT:    br i1 [[VALIDGSPRIM]], label [[DOTBEGINGS:%.*]], label [[DOTENDGS:%.*]]
+; SHADERTEST:       .beginGs:
+; SHADERTEST-NEXT:    [[TMP18:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[TMP6:%.*]], i32 0, i32 16)
+; SHADERTEST-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[TMP6]], i32 16, i32 16)
+; SHADERTEST-NEXT:    [[TMP20:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[TMP7:%.*]], i32 0, i32 16)
+; SHADERTEST-NEXT:    [[TMP21:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[TMP7]], i32 16, i32 16)
+; SHADERTEST-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 poison, i32 0, i32 16)
+; SHADERTEST-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 poison, i32 16, i32 16)
+; SHADERTEST-NEXT:    [[TMP24:%.*]] = extractelement <1 x i32> [[TMP5]], i64 0
+; SHADERTEST-NEXT:    call amdgpu_gs void @_amdgpu_gs_main.2(i32 [[TMP24]], i32 [[GSVSOFFSET:%.*]], i32 [[GSWAVEID]], i32 [[TMP18]], i32 [[TMP19]], i32 [[TMP8:%.*]], i32 [[TMP20]], i32 [[TMP21]], i32 [[TMP22]], i32 [[TMP23]], i32 [[TMP9:%.*]]), !dbg [[DBG103]]
+; SHADERTEST-NEXT:    br label [[DOTENDGS]]
+; SHADERTEST:       .endGs:
+; SHADERTEST-NEXT:    ret void
+;
+;
+; SHADERTEST-LABEL: @_amdgpu_ls_main(
+; SHADERTEST-NEXT:  .entry:
+; SHADERTEST-NEXT:    ret void, !dbg [[DBG108:![0-9]+]]
+;
+;
+; SHADERTEST-LABEL: @_amdgpu_hs_main.1(
+; SHADERTEST-NEXT:  .entry:
+; SHADERTEST-NEXT:    [[TMP0:%.*]] = and i32 [[RELPATCHID:%.*]], 255
+; SHADERTEST-NEXT:    [[TMP1:%.*]] = call noundef i64 @llvm.amdgcn.s.getpc() #[[ATTR7:[0-9]+]]
+; SHADERTEST-NEXT:    [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32>
+; SHADERTEST-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[GLOBALTABLE:%.*]], i32 0
+; SHADERTEST-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64
+; SHADERTEST-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(4)
+; SHADERTEST-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 144
+; SHADERTEST-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP6]], align 16
+; SHADERTEST-NEXT:    [[TMP8:%.*]] = call noundef i64 @llvm.amdgcn.s.getpc() #[[ATTR7]]
+; SHADERTEST-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], -4294967296
+; SHADERTEST-NEXT:    [[TMP10:%.*]] = zext i32 [[GLOBALTABLE]] to i64
+; SHADERTEST-NEXT:    [[TMP11:%.*]] = or disjoint i64 [[TMP9]], [[TMP10]]
+; SHADERTEST-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr addrspace(4)
+; SHADERTEST-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP12]], i64 160
+; SHADERTEST-NEXT:    [[TMP14:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP13]], align 16
+; SHADERTEST-NEXT:    [[TMP15:%.*]] = and i32 [[RELPATCHID]], 255
+; SHADERTEST-NEXT:    [[TMP16:%.*]] = lshr i32 [[RELPATCHID]], 8
+; SHADERTEST-NEXT:    [[TMP17:%.*]] = and i32 [[TMP16]], 31
+; SHADERTEST-NEXT:    [[DOTIDX:%.*]] = mul nuw nsw i32 [[TMP15]], 24, !dbg [[DBG111:![0-9]+]]
+; SHADERTEST-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(3) @Lds.HS, i32 [[DOTIDX]], !dbg [[DBG111]]
+; SHADERTEST-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP18]], i32 3072, !dbg [[DBG111]]
+; SHADERTEST-NEXT:    store i32 1073741824, ptr addrspace(3) [[TMP19]], align 4, !dbg [[DBG111]]
+; SHADERTEST-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP18]], i32 3076, !dbg [[DBG112:![0-9]+]]
+; SHADERTEST-NEXT:    store i32 1073741824, ptr addrspace(3) [[TMP20]], align 4, !dbg [[DBG112]]
+; SHADERTEST-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP18]], i32 3080, !dbg [[DBG113:![0-9]+]]
+; SHADERTEST-NEXT:    store i32 1073741824, ptr addrspace(3) [[TMP21]], align 4, !dbg [[DBG113]]
+; SHADERTEST-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP18]], i32 3088, !dbg [[DBG114:![0-9]+]]
+; SHADERTEST-NEXT:    store i32 1082130432, ptr addrspace(3) [[TMP22]], align 4, !dbg [[DBG114]]
+; SHADERTEST-NEXT:    [[TMP23:%.*]] = mul nuw nsw i32 [[TMP15]], 3, !dbg [[DBG115:![0-9]+]]
+; SHADERTEST-NEXT:    [[TMP24:%.*]] = add nuw nsw i32 [[TMP23]], [[TMP17]], !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP25:%.*]] = shl nuw nsw i32 [[TMP24]], 2, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP26:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 [[TMP25]], !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP27:%.*]] = load i32, ptr addrspace(3) [[TMP26]], align 4, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP28:%.*]] = or disjoint i32 [[TMP25]], 1, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP29:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 [[TMP28]], !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP30:%.*]] = load i32, ptr addrspace(3) [[TMP29]], align 4, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP31:%.*]] = or disjoint i32 [[TMP25]], 2, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP32:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 [[TMP31]], !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(3) [[TMP32]], align 4, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP34:%.*]] = or disjoint i32 [[TMP25]], 3, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP35:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 [[TMP34]], !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP36:%.*]] = load i32, ptr addrspace(3) [[TMP35]], align 4, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP37:%.*]] = shl nuw nsw i32 [[TMP17]], 4, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP38:%.*]] = mul nuw nsw i32 [[TMP15]], 48, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP39:%.*]] = add nuw nsw i32 [[TMP37]], [[TMP38]], !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[DOTUPTO010:%.*]] = insertelement <4 x i32> poison, i32 [[TMP27]], i64 0, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[DOTUPTO111:%.*]] = insertelement <4 x i32> [[DOTUPTO010]], i32 [[TMP30]], i64 1, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[DOTUPTO212:%.*]] = insertelement <4 x i32> [[DOTUPTO111]], i32 [[TMP33]], i64 2, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP40:%.*]] = insertelement <4 x i32> [[DOTUPTO212]], i32 [[TMP36]], i64 3, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> [[TMP40]], <4 x i32> [[TMP14]], i32 [[TMP39]], i32 [[OFFCHIPLDSBASE:%.*]], i32 immarg 77, i32 immarg 1) #[[ATTR12:[0-9]+]], !dbg [[DBG115]]
+; SHADERTEST-NEXT:    fence syncscope("workgroup") release, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.s.barrier(), !dbg [[DBG115]]
+; SHADERTEST-NEXT:    fence syncscope("workgroup") acquire, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP41:%.*]] = mul i32 [[TMP0]], 6, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP42:%.*]] = add i32 [[TMP41]], 768, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP43:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 [[TMP42]], !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP44:%.*]] = load <3 x float>, ptr addrspace(3) [[TMP43]], align 4, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP45:%.*]] = mul i32 [[TMP0]], 6, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP46:%.*]] = add i32 [[TMP45]], 772, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP47:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.HS, i32 [[TMP46]], !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP48:%.*]] = load <1 x float>, ptr addrspace(3) [[TMP47]], align 4, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP49:%.*]] = mul i32 [[TMP0]], 16, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP50:%.*]] = shufflevector <3 x float> [[TMP44]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP51:%.*]] = extractelement <1 x float> [[TMP48]], i64 0, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    [[TMP52:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP51]], i64 3, !dbg [[DBG115]]
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> [[TMP52]], <4 x i32> [[TMP7]], i32 [[TMP49]], i32 [[TFBUFFERBASE:%.*]], i32 77, i32 1), !dbg [[DBG115]]
+; SHADERTEST-NEXT:    ret void, !dbg [[DBG115]]
+;
+;
+; SHADERTEST-LABEL: @_amdgpu_es_main(
+; SHADERTEST-NEXT:  .entry:
+; SHADERTEST-NEXT:    [[TMP0:%.*]] = fadd float [[TESSCOORDX:%.*]], [[TESSCOORDY:%.*]]
+; SHADERTEST-NEXT:    [[TMP1:%.*]] = fsub float 1.000000e+00, [[TMP0]], !dbg [[DBG121:![0-9]+]]
+; SHADERTEST-NEXT:    [[TMP2:%.*]] = call noundef i64 @llvm.amdgcn.s.getpc() #[[ATTR7]]
+; SHADERTEST-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], -4294967296
+; SHADERTEST-NEXT:    [[TMP4:%.*]] = zext i32 [[GLOBALTABLE:%.*]] to i64
+; SHADERTEST-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[TMP3]], [[TMP4]]
+; SHADERTEST-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(4)
+; SHADERTEST-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i64 160
+; SHADERTEST-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP7]], align 16
+; SHADERTEST-NEXT:    [[TMP9:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; SHADERTEST-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP9]])
+; SHADERTEST-NEXT:    #dbg_value(float [[TESSCOORDX]], !118, !DIExpression(), !122)
+; SHADERTEST-NEXT:    #dbg_value(float [[TESSCOORDY]], !119, !DIExpression(), !122)
+; SHADERTEST-NEXT:    #dbg_value(float [[TMP1]], !120, !DIExpression(), !122)
+; SHADERTEST-NEXT:    [[TMP11:%.*]] = mul i32 [[RELPATCHID:%.*]], 48, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> [[TMP8]], i32 [[TMP11]], i32 [[OFFCHIPLDSBASE:%.*]], i32 immarg 77, i32 immarg 5) #[[ATTR8:[0-9]+]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI07:%.*]] = extractelement <4 x float> [[BC]], i64 0, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[BC56:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI18:%.*]] = extractelement <4 x float> [[BC56]], i64 1, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[BC57:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI29:%.*]] = extractelement <4 x float> [[BC57]], i64 2, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[BC58:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI310:%.*]] = extractelement <4 x float> [[BC58]], i64 3, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[SCALE_I0:%.*]] = fmul nnan nsz float [[TESSCOORDX]], [[DOTI07]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[SCALE_I1:%.*]] = fmul nnan nsz float [[TESSCOORDX]], [[DOTI18]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[SCALE_I2:%.*]] = fmul nnan nsz float [[TESSCOORDX]], [[DOTI29]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[SCALE_I3:%.*]] = fmul nnan nsz float [[TESSCOORDX]], [[DOTI310]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[TMP13:%.*]] = add i32 [[TMP11]], 16, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[TMP14:%.*]] = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> [[TMP8]], i32 [[TMP13]], i32 [[OFFCHIPLDSBASE]], i32 immarg 77, i32 immarg 5) #[[ATTR8]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[BC59:%.*]] = bitcast <4 x i32> [[TMP14]] to <4 x float>, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI012:%.*]] = extractelement <4 x float> [[BC59]], i64 0, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[BC60:%.*]] = bitcast <4 x i32> [[TMP14]] to <4 x float>, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI114:%.*]] = extractelement <4 x float> [[BC60]], i64 1, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[BC61:%.*]] = bitcast <4 x i32> [[TMP14]] to <4 x float>, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI216:%.*]] = extractelement <4 x float> [[BC61]], i64 2, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[BC62:%.*]] = bitcast <4 x i32> [[TMP14]] to <4 x float>, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI318:%.*]] = extractelement <4 x float> [[BC62]], i64 3, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[SCALE2_I0:%.*]] = fmul nnan nsz float [[TESSCOORDY]], [[DOTI012]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[SCALE2_I1:%.*]] = fmul nnan nsz float [[TESSCOORDY]], [[DOTI114]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[SCALE2_I2:%.*]] = fmul nnan nsz float [[TESSCOORDY]], [[DOTI216]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[SCALE2_I3:%.*]] = fmul nnan nsz float [[TESSCOORDY]], [[DOTI318]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI019:%.*]] = fadd nnan nsz float [[SCALE_I0]], [[SCALE2_I0]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI120:%.*]] = fadd nnan nsz float [[SCALE_I1]], [[SCALE2_I1]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI221:%.*]] = fadd nnan nsz float [[SCALE_I2]], [[SCALE2_I2]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI322:%.*]] = fadd nnan nsz float [[SCALE_I3]], [[SCALE2_I3]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[TMP15:%.*]] = add i32 [[TMP11]], 32, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[TMP16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> [[TMP8]], i32 [[TMP15]], i32 [[OFFCHIPLDSBASE]], i32 immarg 77, i32 immarg 5) #[[ATTR8]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[BC63:%.*]] = bitcast <4 x i32> [[TMP16]] to <4 x float>, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI024:%.*]] = extractelement <4 x float> [[BC63]], i64 0, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[BC64:%.*]] = bitcast <4 x i32> [[TMP16]] to <4 x float>, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI126:%.*]] = extractelement <4 x float> [[BC64]], i64 1, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[BC65:%.*]] = bitcast <4 x i32> [[TMP16]] to <4 x float>, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI228:%.*]] = extractelement <4 x float> [[BC65]], i64 2, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[BC66:%.*]] = bitcast <4 x i32> [[TMP16]] to <4 x float>, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI330:%.*]] = extractelement <4 x float> [[BC66]], i64 3, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[SCALE4_I0:%.*]] = fmul nnan nsz float [[TMP1]], [[DOTI024]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[SCALE4_I1:%.*]] = fmul nnan nsz float [[TMP1]], [[DOTI126]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[SCALE4_I2:%.*]] = fmul nnan nsz float [[TMP1]], [[DOTI228]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[SCALE4_I3:%.*]] = fmul nnan nsz float [[TMP1]], [[DOTI330]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI031:%.*]] = fadd nnan nsz float [[DOTI019]], [[SCALE4_I0]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI132:%.*]] = fadd nnan nsz float [[DOTI120]], [[SCALE4_I1]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI233:%.*]] = fadd nnan nsz float [[DOTI221]], [[SCALE4_I2]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTI334:%.*]] = fadd nnan nsz float [[DOTI322]], [[SCALE4_I3]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[DOTIDX:%.*]] = mul i32 [[TMP10]], 20, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr addrspace(3) @Lds.GS, i32 [[DOTIDX]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr addrspace(3) [[TMP17]], i32 [[ESGSOFFSET:%.*]], !dbg [[DBG121]]
+; SHADERTEST-NEXT:    store float [[DOTI031]], ptr addrspace(3) [[TMP18]], align 4, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP18]], i32 4, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    store float [[DOTI132]], ptr addrspace(3) [[TMP19]], align 4, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP18]], i32 8, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    store float [[DOTI233]], ptr addrspace(3) [[TMP20]], align 4, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP18]], i32 12, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    store float [[DOTI334]], ptr addrspace(3) [[TMP21]], align 4, !dbg [[DBG121]]
+; SHADERTEST-NEXT:    ret void, !dbg [[DBG121]]
+;
+;
+; SHADERTEST-LABEL: @_amdgpu_gs_main.2(
+; SHADERTEST-NEXT:  .entry:
+; SHADERTEST-NEXT:    #dbg_value(i32 0, !125, !DIExpression(), !128)
+; SHADERTEST-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.GS, i32 [[ESGSOFFSET2:%.*]], !dbg [[DBG129:![0-9]+]]
+; SHADERTEST-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 12, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(3) [[TMP1]], align 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 8, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(3) [[TMP3]], align 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(3) [[TMP0]], align 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.GS, i32 [[ESGSOFFSET1:%.*]], !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 12, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(3) [[TMP9]], align 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 8, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(3) [[TMP11]], align 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(3) [[TMP8]], align 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.GS, i32 [[ESGSOFFSET0:%.*]], !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP16]], i32 12, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP18:%.*]] = load i32, ptr addrspace(3) [[TMP17]], align 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP16]], i32 8, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP20:%.*]] = load i32, ptr addrspace(3) [[TMP19]], align 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP16]], i32 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(3) [[TMP21]], align 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP23:%.*]] = load i32, ptr addrspace(3) [[TMP16]], align 4, !dbg [[DBG129]]
+; SHADERTEST-NEXT:    [[TMP24:%.*]] = call noundef i64 @llvm.amdgcn.s.getpc() #[[ATTR7]]
+; SHADERTEST-NEXT:    [[TMP25:%.*]] = and i64 [[TMP24]], -4294967296
+; SHADERTEST-NEXT:    [[TMP26:%.*]] = zext i32 [[GLOBALTABLE:%.*]] to i64
+; SHADERTEST-NEXT:    [[TMP27:%.*]] = or disjoint i64 [[TMP25]], [[TMP26]]
+; SHADERTEST-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr addrspace(4)
+; SHADERTEST-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP28]], i64 64
+; SHADERTEST-NEXT:    [[TMP30:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP29]], align 16
+; SHADERTEST-NEXT:    [[DOTI1:%.*]] = extractelement <4 x i32> [[TMP30]], i64 1
+; SHADERTEST-NEXT:    [[TMP31:%.*]] = and i32 [[DOTI1]], -1073676289
+; SHADERTEST-NEXT:    [[TMP32:%.*]] = or disjoint i32 [[TMP31]], 3145728
+; SHADERTEST-NEXT:    [[DOTUPTO19:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP32]], i64 1
+; SHADERTEST-NEXT:    [[DOTI2:%.*]] = extractelement <4 x i32> [[TMP30]], i64 2
+; SHADERTEST-NEXT:    [[DOTUPTO210:%.*]] = insertelement <4 x i32> [[DOTUPTO19]], i32 [[DOTI2]], i64 2
+; SHADERTEST-NEXT:    [[DOTI3:%.*]] = extractelement <4 x i32> [[TMP30]], i64 3
+; SHADERTEST-NEXT:    [[TMP33:%.*]] = and i32 [[DOTI3]], -491521
+; SHADERTEST-NEXT:    [[TMP34:%.*]] = or disjoint i32 [[TMP33]], 131072
+; SHADERTEST-NEXT:    [[TMP35:%.*]] = insertelement <4 x i32> [[DOTUPTO210]], i32 [[TMP34]], i64 3
+; SHADERTEST-NEXT:    #dbg_value(i32 0, !125, !DIExpression(), !128)
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 [[TMP23]], <4 x i32> [[TMP35]], i32 0, i32 [[GSVSOFFSET:%.*]], i32 20, i32 11), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 [[TMP22]], <4 x i32> [[TMP35]], i32 12, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 [[TMP20]], <4 x i32> [[TMP35]], i32 24, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 [[TMP18]], <4 x i32> [[TMP35]], i32 36, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.s.sendmsg(i32 34, i32 [[GSWAVEID:%.*]]), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    #dbg_value(i32 1, !125, !DIExpression(), !128)
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 [[TMP15]], <4 x i32> [[TMP35]], i32 4, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 [[TMP14]], <4 x i32> [[TMP35]], i32 16, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 [[TMP12]], <4 x i32> [[TMP35]], i32 28, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 [[TMP10]], <4 x i32> [[TMP35]], i32 40, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.s.sendmsg(i32 34, i32 [[GSWAVEID]]), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    #dbg_value(i32 2, !125, !DIExpression(), !128)
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 [[TMP7]], <4 x i32> [[TMP35]], i32 8, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 [[TMP6]], <4 x i32> [[TMP35]], i32 20, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 [[TMP4]], <4 x i32> [[TMP35]], i32 32, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 [[TMP2]], <4 x i32> [[TMP35]], i32 44, i32 [[GSVSOFFSET]], i32 20, i32 11), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.s.sendmsg(i32 34, i32 [[GSWAVEID]]), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    #dbg_value(i32 3, !125, !DIExpression(), !128)
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.s.sendmsg(i32 18, i32 [[GSWAVEID]]), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    fence syncscope("agent") release, !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.s.sendmsg(i32 3, i32 [[GSWAVEID]]), !dbg !{{[0-9]+}}
+; SHADERTEST-NEXT:    ret void, !dbg !{{[0-9]+}}
+;
+;
+; SHADERTEST-LABEL: @_amdgpu_vs_main(
+; SHADERTEST-NEXT:    [[TMP1:%.*]] = call noundef i64 @llvm.amdgcn.s.getpc() #[[ATTR7]]
+; SHADERTEST-NEXT:    [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296
+; SHADERTEST-NEXT:    [[TMP3:%.*]] = zext i32 [[GLOBALTABLE:%.*]] to i64
+; SHADERTEST-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[TMP2]], [[TMP3]]
+; SHADERTEST-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(4)
+; SHADERTEST-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i64 128
+; SHADERTEST-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP6]], align 16, !invariant.load !98
+; SHADERTEST-NEXT:    [[TMP8:%.*]] = shl i32 [[VERTEXOFFSET:%.*]], 2
+; SHADERTEST-NEXT:    [[TMP9:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[TMP7]], i32 [[TMP8]], i32 0, i32 3), !invariant.load !98
+; SHADERTEST-NEXT:    [[TMP10:%.*]] = add i32 [[TMP8]], 192
+; SHADERTEST-NEXT:    [[TMP11:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[TMP7]], i32 [[TMP10]], i32 0, i32 3), !invariant.load !98
+; SHADERTEST-NEXT:    [[TMP12:%.*]] = add i32 [[TMP8]], 384
+; SHADERTEST-NEXT:    [[TMP13:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[TMP7]], i32 [[TMP12]], i32 0, i32 3), !invariant.load !98
+; SHADERTEST-NEXT:    [[TMP14:%.*]] = add i32 [[TMP8]], 576
+; SHADERTEST-NEXT:    [[TMP15:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[TMP7]], i32 [[TMP14]], i32 0, i32 3), !invariant.load !98
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float [[TMP9]], float [[TMP11]], float [[TMP13]], float [[TMP15]], i1 true, i1 false)
+; SHADERTEST-NEXT:    ret void
+;
+;
+; SHADERTEST-LABEL: @_amdgpu_ps_main(
+; SHADERTEST-NEXT:    ret void
+;
+;.
+; SHADERTEST: attributes #[[ATTR0:[0-9]+]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "target-features"=",+wavefrontsize64" }
+; SHADERTEST: attributes #[[ATTR1:[0-9]+]] = { alwaysinline nounwind memory(readwrite) "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64" }
+; SHADERTEST: attributes #[[ATTR2:[0-9]+]] = { nounwind willreturn memory(none) }
+; SHADERTEST: attributes #[[ATTR3:[0-9]+]] = { nounwind }
+; SHADERTEST: attributes #[[ATTR4:[0-9]+]] = { nounwind willreturn memory(read) }
+; SHADERTEST: attributes #[[ATTR5:[0-9]+]] = { "target-features"=",+wavefrontsize64" }
+; SHADERTEST: attributes #[[ATTR6:[0-9]+]] = { memory(readwrite) "InitialPSInputAddr"="0" "amdgpu-color-export"="0" "amdgpu-depth-export"="0" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "target-features"=",+wavefrontsize64" }
+; SHADERTEST: attributes #[[ATTR7]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; SHADERTEST: attributes #[[ATTR8]] = { nocallback nofree nosync nounwind willreturn memory(read) }
+; SHADERTEST: attributes #[[ATTR9:[0-9]+]] = { nounwind memory(none) }
+; SHADERTEST: attributes #[[ATTR10:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; SHADERTEST: attributes #[[ATTR11:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+; SHADERTEST: attributes #[[ATTR12]] = { nocallback nofree nosync nounwind willreturn memory(write) }
+; SHADERTEST: attributes #[[ATTR13:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
+; SHADERTEST: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "spirv", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !2)
+; SHADERTEST: [[META1:![0-9]+]] = !DIFile(filename: "test.vert", directory: ".", source: "// OpModuleProcessed client vulkan100\0A// OpModuleProcessed target-env vulkan1.0\0A// OpModuleProcessed entry-point main\0A#line 1\0A#version 450\0Alayout(location = 0) in vec4 position
+; SHADERTEST: [[META2:![0-9]+]] = !{!3, !9}
+; SHADERTEST: [[META3:![0-9]+]] = !DIGlobalVariableExpression(var: !4, expr: !DIExpression())
+; SHADERTEST: [[META4:![0-9]+]] = distinct !DIGlobalVariable(name: "positionOut", linkageName: "positionOut", scope: !0, file: !1, line: 7, type: !5, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META5:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 128, flags: DIFlagVector, elements: !7)
+; SHADERTEST: [[META6:![0-9]+]] = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
+; SHADERTEST: [[META7:![0-9]+]] = !{!8}
+; SHADERTEST: [[META8:![0-9]+]] = !DISubrange(count: 4, lowerBound: 0)
+; SHADERTEST: [[META9:![0-9]+]] = !DIGlobalVariableExpression(var: !10, expr: !DIExpression())
+; SHADERTEST: [[META10:![0-9]+]] = distinct !DIGlobalVariable(name: "position", linkageName: "position", scope: !0, file: !1, line: 7, type: !5, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META11:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !12, producer: "spirv", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !13)
+; SHADERTEST: [[META12:![0-9]+]] = !DIFile(filename: "test.tesc", directory: ".", source: "// OpModuleProcessed client vulkan100\0A// OpModuleProcessed target-env vulkan1.0\0A// OpModuleProcessed entry-point main\0A#line 1\0A#version 450 core\0Alayout(vertices=3) out
+; SHADERTEST: [[META13:![0-9]+]] = !{!14, !17, !22, !35, !38}
+; SHADERTEST: [[META14:![0-9]+]] = !DIGlobalVariableExpression(var: !15, expr: !DIExpression())
+; SHADERTEST: [[META15:![0-9]+]] = distinct !DIGlobalVariable(name: "gl_TessLevelOuter", linkageName: "gl_TessLevelOuter", scope: !11, file: !12, line: 6, type: !16, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META16:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 128, elements: !7)
+; SHADERTEST: [[META17:![0-9]+]] = !DIGlobalVariableExpression(var: !18, expr: !DIExpression())
+; SHADERTEST: [[META18:![0-9]+]] = distinct !DIGlobalVariable(name: "gl_TessLevelInner", linkageName: "gl_TessLevelInner", scope: !11, file: !12, line: 9, type: !19, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META19:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 64, elements: !20)
+; SHADERTEST: [[META20:![0-9]+]] = !{!21}
+; SHADERTEST: [[META21:![0-9]+]] = !DISubrange(count: 2, lowerBound: 0)
+; SHADERTEST: [[META22:![0-9]+]] = !DIGlobalVariableExpression(var: !23, expr: !DIExpression())
+; SHADERTEST: [[META23:![0-9]+]] = distinct !DIGlobalVariable(name: "gl_out", linkageName: "gl_out", scope: !11, file: !12, line: 11, type: !24, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META24:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !25, elements: !33)
+; SHADERTEST: [[META25:![0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "gl_PerVertex", file: !12, line: 11, flags: DIFlagPublic, elements: !26, identifier: "gl_PerVertex")
+; SHADERTEST: [[META26:![0-9]+]] = !{!27, !28, !29, !29}
+; SHADERTEST: [[META27:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "gl_Position", file: !12, line: 1, baseType: !5, flags: DIFlagPublic)
+; SHADERTEST: [[META28:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "gl_PointSize", file: !12, line: 1, baseType: !6, flags: DIFlagPublic)
+; SHADERTEST: [[META29:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "gl_CullDistance", file: !12, line: 1, baseType: !30, flags: DIFlagPublic)
+; SHADERTEST: [[META30:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 32, elements: !31)
+; SHADERTEST: [[META31:![0-9]+]] = !{!32}
+; SHADERTEST: [[META32:![0-9]+]] = !DISubrange(count: 1, lowerBound: 0)
+; SHADERTEST: [[META33:![0-9]+]] = !{!34}
+; SHADERTEST: [[META34:![0-9]+]] = !DISubrange(count: 3, lowerBound: 0)
+; SHADERTEST: [[META35:![0-9]+]] = !DIGlobalVariableExpression(var: !36, expr: !DIExpression())
+; SHADERTEST: [[META36:![0-9]+]] = distinct !DIGlobalVariable(name: "gl_InvocationID", linkageName: "gl_InvocationID", scope: !11, file: !12, line: 11, type: !37, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META37:![0-9]+]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+; SHADERTEST: [[META38:![0-9]+]] = !DIGlobalVariableExpression(var: !39, expr: !DIExpression())
+; SHADERTEST: [[META39:![0-9]+]] = distinct !DIGlobalVariable(name: "gl_in", linkageName: "gl_in", scope: !11, file: !12, line: 11, type: !40, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META40:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !41, elements: !42)
+; SHADERTEST: [[META41:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "gl_PerVertex", file: !12, line: 11, flags: DIFlagPublic, elements: !26, identifier: "gl_PerVertex")
+; SHADERTEST: [[META42:![0-9]+]] = !{!43}
+; SHADERTEST: [[META43:![0-9]+]] = !DISubrange(count: 32, lowerBound: 0)
+; SHADERTEST: [[META44:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !45, producer: "spirv", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !46)
+; SHADERTEST: [[META45:![0-9]+]] = !DIFile(filename: "test.tese", directory: ".", source: "// OpModuleProcessed client vulkan100\0A// OpModuleProcessed target-env vulkan1.0\0A// OpModuleProcessed entry-point main\0A#line 1\0A#version 450 core\0Alayout(triangles,fractional_even_spacing,ccw) in
+; SHADERTEST: [[META46:![0-9]+]] = !{!47, !50, !57}
+; SHADERTEST: [[META47:![0-9]+]] = !DIGlobalVariableExpression(var: !48, expr: !DIExpression())
+; SHADERTEST: [[META48:![0-9]+]] = distinct !DIGlobalVariable(name: "gl_TessCoord", linkageName: "gl_TessCoord", scope: !44, file: !45, line: 6, type: !49, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META49:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 96, flags: DIFlagVector, elements: !33)
+; SHADERTEST: [[META50:![0-9]+]] = !DIGlobalVariableExpression(var: !51, expr: !DIExpression())
+; SHADERTEST: [[META51:![0-9]+]] = distinct !DIGlobalVariable(scope: !44, file: !45, line: 9, type: !52, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META52:![0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "gl_PerVertex", file: !45, line: 9, flags: DIFlagPublic, elements: !53, identifier: "gl_PerVertex")
+; SHADERTEST: [[META53:![0-9]+]] = !{!54, !55, !56, !56}
+; SHADERTEST: [[META54:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "gl_Position", file: !45, line: 1, baseType: !5, flags: DIFlagPublic)
+; SHADERTEST: [[META55:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "gl_PointSize", file: !45, line: 1, baseType: !6, flags: DIFlagPublic)
+; SHADERTEST: [[META56:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "gl_CullDistance", file: !45, line: 1, baseType: !30, flags: DIFlagPublic)
+; SHADERTEST: [[META57:![0-9]+]] = !DIGlobalVariableExpression(var: !58, expr: !DIExpression())
+; SHADERTEST: [[META58:![0-9]+]] = distinct !DIGlobalVariable(name: "gl_in", linkageName: "gl_in", scope: !44, file: !45, line: 9, type: !59, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META59:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !60, elements: !42)
+; SHADERTEST: [[META60:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "gl_PerVertex", file: !45, line: 9, flags: DIFlagPublic, elements: !53, identifier: "gl_PerVertex")
+; SHADERTEST: [[META61:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !62, producer: "spirv", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !63)
+; SHADERTEST: [[META62:![0-9]+]] = !DIFile(filename: "test.geom", directory: ".", source: "// OpModuleProcessed client vulkan100\0A// OpModuleProcessed target-env vulkan1.0\0A// OpModuleProcessed entry-point main\0A#line 1\0A#version 450 core\0Alayout(triangles) in
+; SHADERTEST: [[META63:![0-9]+]] = !{!64, !71}
+; SHADERTEST: [[META64:![0-9]+]] = !DIGlobalVariableExpression(var: !65, expr: !DIExpression())
+; SHADERTEST: [[META65:![0-9]+]] = distinct !DIGlobalVariable(scope: !61, file: !62, line: 9, type: !66, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META66:![0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "gl_PerVertex", file: !62, line: 9, flags: DIFlagPublic, elements: !67, identifier: "gl_PerVertex")
+; SHADERTEST: [[META67:![0-9]+]] = !{!68, !69, !70, !70}
+; SHADERTEST: [[META68:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "gl_Position", file: !62, line: 2, baseType: !5, flags: DIFlagPublic)
+; SHADERTEST: [[META69:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "gl_PointSize", file: !62, line: 2, baseType: !6, flags: DIFlagPublic)
+; SHADERTEST: [[META70:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "gl_CullDistance", file: !62, line: 3, baseType: !30, flags: DIFlagPublic)
+; SHADERTEST: [[META71:![0-9]+]] = !DIGlobalVariableExpression(var: !72, expr: !DIExpression())
+; SHADERTEST: [[META72:![0-9]+]] = distinct !DIGlobalVariable(name: "gl_in", linkageName: "gl_in", scope: !61, file: !62, line: 9, type: !73, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META73:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !74, elements: !33)
+; SHADERTEST: [[META74:![0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "gl_PerVertex", file: !62, line: 9, flags: DIFlagPublic, elements: !75, identifier: "gl_PerVertex")
+; SHADERTEST: [[META75:![0-9]+]] = !{!68, !69, !76, !76}
+; SHADERTEST: [[META76:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "gl_CullDistance", file: !62, line: 2, baseType: !30, flags: DIFlagPublic)
+; SHADERTEST: [[META77:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: !78, producer: "lgc", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
+; SHADERTEST: [[META78:![0-9]+]] = !DIFile(filename: "internal", directory: "")
+; SHADERTEST: [[META79:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: !78, producer: "lgc", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
+; SHADERTEST: [[META80:![0-9]+]] = !{!"Vulkan"}
+; SHADERTEST: [[META81:![0-9]+]] = !{i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 2, i32 1
+; SHADERTEST: [[META82:![0-9]+]] = !{i32 {{.*}}, i32 {{.*}}, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1}
+; SHADERTEST: [[META83:![0-9]+]] = !{i32 {{.*}}, i32 {{.*}}, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1}
+; SHADERTEST: [[META84:![0-9]+]] = !{i32 {{.*}}, i32 {{.*}}, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1}
+; SHADERTEST: [[META85:![0-9]+]] = !{i32 {{.*}}, i32 {{.*}}, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1}
+; SHADERTEST: [[META86:![0-9]+]] = !{i32 0, i32 0, i32 0, i32 16, i32 14, i32 7}
+; SHADERTEST: [[META87:![0-9]+]] = !{i32 1, i32 0, i32 0, i32 16, i32 11, i32 7}
+; SHADERTEST: [[META88:![0-9]+]] = !{i32 2, i32 0, i32 0, i32 16, i32 11, i32 7}
+; SHADERTEST: [[META89:![0-9]+]] = !{i32 10}
+; SHADERTEST: [[META90:![0-9]+]] = !{!"\82\B0amdpal.pipelines{{.*}}AEamdpal.version\92\03\00"}
+; SHADERTEST: [[META91:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4}
+; SHADERTEST: [[META92:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
+; SHADERTEST: [[META93:![0-9]+]] = !{i32 0, i32 0, i32 0, i32 0, i32 3, i32 3}
+; SHADERTEST: [[META94:![0-9]+]] = !{i32 2, i32 1, i32 1}
+; SHADERTEST: [[META95:![0-9]+]] = !{i32 3, i32 4, i32 1, i32 3}
+; SHADERTEST: [[META96:![0-9]+]] = distinct !DISubprogram(name: "lgc.shader.LSHS.main", scope: !78, file: !78, type: !97, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !79)
+; SHADERTEST: [[META97:![0-9]+]] = !DISubroutineType(types: !98)
+; SHADERTEST: [[META98:![0-9]+]] = !{}
+; SHADERTEST: [[META99:![0-9]+]] = !{i32 2}
+; SHADERTEST: [[DBG100]] = !DILocation(line: 0, scope: !96)
+; SHADERTEST: [[META101:![0-9]+]] = distinct !DISubprogram(name: "lgc.shader.ESGS.main", scope: !78, file: !78, type: !97, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !77)
+; SHADERTEST: [[META102:![0-9]+]] = !{i32 4}
+; SHADERTEST: [[DBG103]] = !DILocation(line: 0, scope: !101)
+; SHADERTEST: [[META104:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !1, type: !105, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !0, templateParams: !98)
+; SHADERTEST: [[META105:![0-9]+]] = !DISubroutineType(flags: DIFlagPublic, types: !106)
+; SHADERTEST: [[META106:![0-9]+]] = !{null}
+; SHADERTEST: [[META107:![0-9]+]] = !{i32 0}
+; SHADERTEST: [[DBG108]] = !DILocation(line: 7, scope: !104)
+; SHADERTEST: [[META109:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !12, type: !105, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !11, templateParams: !98)
+; SHADERTEST: [[META110:![0-9]+]] = !{i32 1}
+; SHADERTEST: [[DBG111]] = !DILocation(line: 6, scope: !109)
+; SHADERTEST: [[DBG112]] = !DILocation(line: 7, scope: !109)
+; SHADERTEST: [[DBG113]] = !DILocation(line: 8, scope: !109)
+; SHADERTEST: [[DBG114]] = !DILocation(line: 9, scope: !109)
+; SHADERTEST: [[DBG115]] = !DILocation(line: 11, scope: !109)
+; SHADERTEST: [[META116:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !45, type: !105, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !44, templateParams: !98, retainedNodes: !117)
+; SHADERTEST: [[META117:![0-9]+]] = !{!118, !119, !120}
+; SHADERTEST: [[META118:![0-9]+]] = !DILocalVariable(name: "u", scope: !116, file: !45, line: 6, type: !6)
+; SHADERTEST: [[META119:![0-9]+]] = !DILocalVariable(name: "v", scope: !116, file: !45, line: 7, type: !6)
+; SHADERTEST: [[META120:![0-9]+]] = !DILocalVariable(name: "w", scope: !116, file: !45, line: 8, type: !6)
+; SHADERTEST: [[DBG121]] = !DILocation(line: 9, scope: !116)
+; SHADERTEST: [[META122:![0-9]+]] = !DILocation(line: 0, scope: !116)
+; SHADERTEST: [[META123:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !62, type: !105, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !61, templateParams: !98, retainedNodes: !124)
+; SHADERTEST: [[META124:![0-9]+]] = !{!125}
+; SHADERTEST: [[META125:![0-9]+]] = !DILocalVariable(name: "i", scope: !123, file: !62, line: 7, type: !126)
+; SHADERTEST: [[META126:![0-9]+]] = !DIBasicType(name: "uint", size: 32, encoding: DW_ATE_unsigned)
+; SHADERTEST: [[META127:![0-9]+]] = !{i32 3}
+; SHADERTEST: [[META128:![0-9]+]] = !DILocation(line: 0, scope: !123)
+; SHADERTEST: !{{[0-9]+}} = !DILocation(line: 10, scope: !123)
+; SHADERTEST: !{{[0-9]+}} = !DILocation(line: 12, scope: !123)
+; SHADERTEST: [[META131:![0-9]+]] = !{i32 8}
+; SHADERTEST: [[META132:![0-9]+]] = !{i32 6}
+;.
diff --git a/llpc/test/shaderdb/debug_info/PipelineGs_TestVsGSMergeShader.pipe b/llpc/test/shaderdb/debug_info/PipelineGs_TestVsGSMergeShader.pipe
new file mode 100644
index 0000000000..308d2fec90
--- /dev/null
+++ b/llpc/test/shaderdb/debug_info/PipelineGs_TestVsGSMergeShader.pipe
@@ -0,0 +1,435 @@
+; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --check-globals
+; RUN: amdllpc --print-after=lgc-patch-prepare-pipeline-abi -trim-debug-info=false 2>&1 %s | FileCheck -check-prefix=SHADERTEST %s
+
+[Version]
+version = 53
+
+[VsSpirv]
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 43
+; Schema: 0
+               OpCapability Shader
+               OpExtension "SPV_KHR_non_semantic_info"
+          %2 = OpExtInstImport "NonSemantic.Shader.DebugInfo.100"
+          %3 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint Vertex %main "main" %positionOut %position
+          %1 = OpString "test.vert"
+          %8 = OpString "uint"
+         %15 = OpString "main"
+         %18 = OpString "// OpModuleProcessed client vulkan100
+// OpModuleProcessed target-env vulkan1.0
+// OpModuleProcessed entry-point main
+#line 1
+#version 450
+layout(location = 0) in vec4 position;
+layout(location = 0) out vec4 positionOut;
+
+void main (void)
+{
+    positionOut = position;
+}"
+         %29 = OpString "float"
+         %36 = OpString "positionOut"
+         %41 = OpString "position"
+               OpName %main "main"
+               OpName %positionOut "positionOut"
+               OpName %position "position"
+               OpDecorate %positionOut Location 0
+               OpDecorate %position Location 0
+       %void = OpTypeVoid
+          %5 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+    %uint_32 = OpConstant %uint 32
+     %uint_6 = OpConstant %uint 6
+     %uint_0 = OpConstant %uint 0
+          %9 = OpExtInst %void %2 DebugTypeBasic %8 %uint_32 %uint_6 %uint_0
+     %uint_3 = OpConstant %uint 3
+          %6 = OpExtInst %void %2 DebugTypeFunction %uint_3 %void
+         %17 = OpExtInst %void %2 DebugSource %1 %18
+     %uint_1 = OpConstant %uint 1
+     %uint_4 = OpConstant %uint 4
+     %uint_2 = OpConstant %uint 2
+         %19 = OpExtInst %void %2 DebugCompilationUnit %uint_1 %uint_4 %17 %uint_2
+         %16 = OpExtInst %void %2 DebugFunction %15 %6 %17 %uint_0 %uint_0 %19 %15 %uint_3 %uint_0
+     %uint_7 = OpConstant %uint 7
+      %float = OpTypeFloat 32
+         %30 = OpExtInst %void %2 DebugTypeBasic %29 %uint_32 %uint_3 %uint_0
+    %v4float = OpTypeVector %float 4
+         %32 = OpExtInst %void %2 DebugTypeVector %30 %uint_4
+%_ptr_Output_v4float = OpTypePointer Output %v4float
+%positionOut = OpVariable %_ptr_Output_v4float Output
+     %uint_8 = OpConstant %uint 8
+         %35 = OpExtInst %void %2 DebugGlobalVariable %36 %32 %17 %uint_7 %uint_0 %19 %36 %positionOut %uint_8
+%_ptr_Input_v4float = OpTypePointer Input %v4float
+   %position = OpVariable %_ptr_Input_v4float Input
+         %40 = OpExtInst %void %2 DebugGlobalVariable %41 %32 %17 %uint_7 %uint_0 %19 %41 %position %uint_8
+               OpLine %1 5 16
+       %main = OpFunction %void None %5
+         %23 = OpLabel
+         %24 = OpExtInst %void %2 DebugFunctionDefinition %16 %main
+         %25 = OpExtInst %void %2 DebugScope %16
+         %26 = OpExtInst %void %2 DebugLine %17 %uint_7 %uint_7 %uint_0 %uint_0
+         %42 = OpLoad %v4float %position
+               OpStore %positionOut %42
+               OpReturn
+               OpFunctionEnd
+
+[VsInfo]
+entryPoint = main
+
+[GsSpirv]
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 86
+; Schema: 0
+               OpCapability Geometry
+               OpExtension "SPV_KHR_non_semantic_info"
+          %2 = OpExtInstImport "NonSemantic.Shader.DebugInfo.100"
+          %3 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint Geometry %main "main" %_ %gl_in %position
+               OpExecutionMode %main InputPoints
+               OpExecutionMode %main Invocations 1
+               OpExecutionMode %main OutputPoints
+               OpExecutionMode %main OutputVertices 1
+          %1 = OpString "test.geom"
+          %8 = OpString "uint"
+         %15 = OpString "main"
+         %18 = OpString "// OpModuleProcessed client vulkan100
+// OpModuleProcessed target-env vulkan1.0
+// OpModuleProcessed entry-point main
+#line 1
+#version 450
+layout(points) in;
+layout(points, max_vertices = 1) out;
+layout(location = 0) in vec4 position[];
+
+void main (void)
+{
+    gl_Position = gl_in[0].gl_Position;
+    EmitVertex();
+    EndPrimitive();
+}"
+         %29 = OpString "float"
+         %37 = OpString "gl_Position"
+         %40 = OpString "gl_PointSize"
+         %43 = OpString "gl_CullDistance"
+         %47 = OpString "gl_PerVertex"
+         %51 = OpString ""
+         %53 = OpString "int"
+         %70 = OpString "gl_in"
+         %85 = OpString "position"
+               OpName %main "main"
+               OpName %gl_PerVertex "gl_PerVertex"
+               OpMemberName %gl_PerVertex 0 "gl_Position"
+               OpMemberName %gl_PerVertex 1 "gl_PointSize"
+               OpMemberName %gl_PerVertex 2 "gl_ClipDistance"
+               OpMemberName %gl_PerVertex 3 "gl_CullDistance"
+               OpName %_ ""
+               OpName %gl_PerVertex_0 "gl_PerVertex"
+               OpMemberName %gl_PerVertex_0 0 "gl_Position"
+               OpMemberName %gl_PerVertex_0 1 "gl_PointSize"
+               OpMemberName %gl_PerVertex_0 2 "gl_ClipDistance"
+               OpMemberName %gl_PerVertex_0 3 "gl_CullDistance"
+               OpName %gl_in "gl_in"
+               OpName %position "position"
+               OpMemberDecorate %gl_PerVertex 0 BuiltIn Position
+               OpMemberDecorate %gl_PerVertex 1 BuiltIn PointSize
+               OpMemberDecorate %gl_PerVertex 2 BuiltIn ClipDistance
+               OpMemberDecorate %gl_PerVertex 3 BuiltIn CullDistance
+               OpDecorate %gl_PerVertex Block
+               OpMemberDecorate %gl_PerVertex_0 0 BuiltIn Position
+               OpMemberDecorate %gl_PerVertex_0 1 BuiltIn PointSize
+               OpMemberDecorate %gl_PerVertex_0 2 BuiltIn ClipDistance
+               OpMemberDecorate %gl_PerVertex_0 3 BuiltIn CullDistance
+               OpDecorate %gl_PerVertex_0 Block
+               OpDecorate %position Location 0
+       %void = OpTypeVoid
+          %5 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+    %uint_32 = OpConstant %uint 32
+     %uint_6 = OpConstant %uint 6
+     %uint_0 = OpConstant %uint 0
+          %9 = OpExtInst %void %2 DebugTypeBasic %8 %uint_32 %uint_6 %uint_0
+     %uint_3 = OpConstant %uint 3
+          %6 = OpExtInst %void %2 DebugTypeFunction %uint_3 %void
+         %17 = OpExtInst %void %2 DebugSource %1 %18
+     %uint_1 = OpConstant %uint 1
+     %uint_4 = OpConstant %uint 4
+     %uint_2 = OpConstant %uint 2
+         %19 = OpExtInst %void %2 DebugCompilationUnit %uint_1 %uint_4 %17 %uint_2
+         %16 = OpExtInst %void %2 DebugFunction %15 %6 %17 %uint_0 %uint_0 %19 %15 %uint_3 %uint_0
+     %uint_8 = OpConstant %uint 8
+      %float = OpTypeFloat 32
+         %30 = OpExtInst %void %2 DebugTypeBasic %29 %uint_32 %uint_3 %uint_0
+    %v4float = OpTypeVector %float 4
+         %32 = OpExtInst %void %2 DebugTypeVector %30 %uint_4
+%_arr_float_uint_1 = OpTypeArray %float %uint_1
+         %34 = OpExtInst %void %2 DebugTypeArray %30 %uint_1
+%gl_PerVertex = OpTypeStruct %v4float %float %_arr_float_uint_1 %_arr_float_uint_1
+   %uint_215 = OpConstant %uint 215
+         %36 = OpExtInst %void %2 DebugTypeMember %37 %32 %17 %uint_2 %uint_215 %uint_0 %uint_0 %uint_3
+   %uint_233 = OpConstant %uint 233
+         %39 = OpExtInst %void %2 DebugTypeMember %40 %30 %17 %uint_2 %uint_233 %uint_0 %uint_0 %uint_3
+     %uint_7 = OpConstant %uint 7
+         %42 = OpExtInst %void %2 DebugTypeMember %43 %34 %17 %uint_3 %uint_7 %uint_0 %uint_0 %uint_3
+         %45 = OpExtInst %void %2 DebugTypeMember %43 %34 %17 %uint_3 %uint_7 %uint_0 %uint_0 %uint_3
+         %46 = OpExtInst %void %2 DebugTypeComposite %47 %uint_1 %17 %uint_8 %uint_0 %19 %47 %uint_0 %uint_3 %36 %39 %42 %45
+%_ptr_Output_gl_PerVertex = OpTypePointer Output %gl_PerVertex
+          %_ = OpVariable %_ptr_Output_gl_PerVertex Output
+         %50 = OpExtInst %void %2 DebugGlobalVariable %51 %46 %17 %uint_8 %uint_0 %19 %51 %_ %uint_8
+        %int = OpTypeInt 32 1
+         %54 = OpExtInst %void %2 DebugTypeBasic %53 %uint_32 %uint_4 %uint_0
+      %int_0 = OpConstant %int 0
+%gl_PerVertex_0 = OpTypeStruct %v4float %float %_arr_float_uint_1 %_arr_float_uint_1
+    %uint_23 = OpConstant %uint 23
+         %57 = OpExtInst %void %2 DebugTypeMember %37 %32 %17 %uint_2 %uint_23 %uint_0 %uint_0 %uint_3
+    %uint_41 = OpConstant %uint 41
+         %59 = OpExtInst %void %2 DebugTypeMember %40 %30 %17 %uint_2 %uint_41 %uint_0 %uint_0 %uint_3
+    %uint_84 = OpConstant %uint 84
+         %61 = OpExtInst %void %2 DebugTypeMember %43 %34 %17 %uint_2 %uint_84 %uint_0 %uint_0 %uint_3
+         %63 = OpExtInst %void %2 DebugTypeMember %43 %34 %17 %uint_2 %uint_84 %uint_0 %uint_0 %uint_3
+         %64 = OpExtInst %void %2 DebugTypeComposite %47 %uint_1 %17 %uint_8 %uint_0 %19 %47 %uint_0 %uint_3 %57 %59 %61 %63
+%_arr_gl_PerVertex_0_uint_1 = OpTypeArray %gl_PerVertex_0 %uint_1
+         %66 = OpExtInst %void %2 DebugTypeArray %64 %uint_1
+%_ptr_Input__arr_gl_PerVertex_0_uint_1 = OpTypePointer Input %_arr_gl_PerVertex_0_uint_1
+      %gl_in = OpVariable %_ptr_Input__arr_gl_PerVertex_0_uint_1 Input
+         %69 = OpExtInst %void %2 DebugGlobalVariable %70 %66 %17 %uint_8 %uint_0 %19 %70 %gl_in %uint_8
+%_ptr_Input_v4float = OpTypePointer Input %v4float
+%_ptr_Output_v4float = OpTypePointer Output %v4float
+     %uint_9 = OpConstant %uint 9
+    %uint_10 = OpConstant %uint 10
+%_arr_v4float_uint_1 = OpTypeArray %v4float %uint_1
+         %81 = OpExtInst %void %2 DebugTypeArray %32 %uint_1
+%_ptr_Input__arr_v4float_uint_1 = OpTypePointer Input %_arr_v4float_uint_1
+   %position = OpVariable %_ptr_Input__arr_v4float_uint_1 Input
+         %84 = OpExtInst %void %2 DebugGlobalVariable %85 %81 %17 %uint_10 %uint_0 %19 %85 %position %uint_8
+               OpLine %1 6 16
+       %main = OpFunction %void None %5
+         %23 = OpLabel
+         %24 = OpExtInst %void %2 DebugFunctionDefinition %16 %main
+         %25 = OpExtInst %void %2 DebugScope %16
+         %26 = OpExtInst %void %2 DebugLine %17 %uint_8 %uint_8 %uint_0 %uint_0
+         %72 = OpAccessChain %_ptr_Input_v4float %gl_in %int_0 %int_0
+         %73 = OpLoad %v4float %72
+         %75 = OpAccessChain %_ptr_Output_v4float %_ %int_0
+               OpStore %75 %73
+         %76 = OpExtInst %void %2 DebugLine %17 %uint_9 %uint_9 %uint_0 %uint_0
+               OpEmitVertex
+         %78 = OpExtInst %void %2 DebugLine %17 %uint_10 %uint_10 %uint_0 %uint_0
+               OpEndPrimitive
+               OpReturn
+               OpFunctionEnd
+
+[GsInfo]
+entryPoint = main
+
+
+[ResourceMapping]
+userDataNode[0].visibility = 82
+userDataNode[0].type = PushConst
+userDataNode[0].offsetInDwords = 0
+userDataNode[0].sizeInDwords = 13
+userDataNode[0].set = 0xFFFFFFFF
+userDataNode[0].binding = 0
+userDataNode[1].visibility = 32
+userDataNode[1].type = StreamOutTableVaPtr
+userDataNode[1].offsetInDwords = 13
+userDataNode[1].sizeInDwords = 1
+userDataNode[2].visibility = 2
+userDataNode[2].type = IndirectUserDataVaPtr
+userDataNode[2].offsetInDwords = 14
+userDataNode[2].sizeInDwords = 1
+userDataNode[2].indirectUserDataCount = 4
+
+[GraphicsPipelineState]
+topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST
+nggState.enableNgg = 1
+
+
+[VertexInputState]
+binding[0].binding = 0
+binding[0].stride = 16
+binding[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX
+attribute[0].location = 0
+attribute[0].binding = 0
+attribute[0].format = VK_FORMAT_R32G32B32A32_SFLOAT
+attribute[0].offset = 0
+;.
+; SHADERTEST: @[[LDS_GS:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(3) global [640 x i32], align 4
+;.
+; SHADERTEST-LABEL: define dllexport amdgpu_gs void @_amdgpu_gs_main(
+; SHADERTEST-NEXT:  .entry:
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.init.exec(i64 -1)
+; SHADERTEST-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; SHADERTEST-NEXT:    [[THREADIDINWAVE:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP15]])
+; SHADERTEST-NEXT:    [[ESVERTCOUNT:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[MERGEDWAVEINFO:%.*]], i32 0, i32 8)
+; SHADERTEST-NEXT:    [[GSPRIMCOUNT:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[MERGEDWAVEINFO]], i32 8, i32 8)
+; SHADERTEST-NEXT:    [[GSWAVEID:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[MERGEDWAVEINFO]], i32 16, i32 8)
+; SHADERTEST-NEXT:    [[WAVEINSUBGROUP:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[MERGEDWAVEINFO]], i32 24, i32 4)
+; SHADERTEST-NEXT:    [[TMP16:%.*]] = mul i32 [[WAVEINSUBGROUP]], 320
+; SHADERTEST-NEXT:    [[VALIDESVERT:%.*]] = icmp ult i32 [[THREADIDINWAVE]], [[ESVERTCOUNT]]
+; SHADERTEST-NEXT:    br i1 [[VALIDESVERT]], label [[DOTBEGINES:%.*]], label [[DOTENDES:%.*]]
+; SHADERTEST:       .beginEs:
+; SHADERTEST-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP5:%.*]], i64 0
+; SHADERTEST-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[TMP5]], i64 1
+; SHADERTEST-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP5]], i64 2
+; SHADERTEST-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP5]], i64 3
+; SHADERTEST-NEXT:    call amdgpu_es void @_amdgpu_es_main(i32 [[TMP17]], i32 [[TMP18]], i32 [[TMP19]], i32 [[TMP20]], i32 [[TMP16]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]]), !dbg [[DBG51:![0-9]+]]
+; SHADERTEST-NEXT:    br label [[DOTENDES]]
+; SHADERTEST:       .endEs:
+; SHADERTEST-NEXT:    fence syncscope("workgroup") release
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.s.barrier()
+; SHADERTEST-NEXT:    fence syncscope("workgroup") acquire
+; SHADERTEST-NEXT:    [[VALIDGSPRIM:%.*]] = icmp ult i32 [[THREADIDINWAVE]], [[GSPRIMCOUNT]]
+; SHADERTEST-NEXT:    br i1 [[VALIDGSPRIM]], label [[DOTBEGINGS:%.*]], label [[DOTENDGS:%.*]]
+; SHADERTEST:       .beginGs:
+; SHADERTEST-NEXT:    [[TMP21:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[TMP6:%.*]], i32 0, i32 16)
+; SHADERTEST-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[TMP6]], i32 16, i32 16)
+; SHADERTEST-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 poison, i32 0, i32 16)
+; SHADERTEST-NEXT:    [[TMP24:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 poison, i32 16, i32 16)
+; SHADERTEST-NEXT:    [[TMP25:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 poison, i32 0, i32 16)
+; SHADERTEST-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 poison, i32 16, i32 16)
+; SHADERTEST-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP5]], i64 0
+; SHADERTEST-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP5]], i64 1
+; SHADERTEST-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP5]], i64 2
+; SHADERTEST-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP5]], i64 3
+; SHADERTEST-NEXT:    call amdgpu_gs void @_amdgpu_gs_main.1(i32 [[TMP27]], i32 [[TMP28]], i32 [[TMP29]], i32 [[TMP30]], i32 [[GSVSOFFSET:%.*]], i32 [[GSWAVEID]], i32 [[TMP21]], i32 [[TMP22]], i32 [[TMP8:%.*]], i32 [[TMP23]], i32 [[TMP24]], i32 [[TMP25]], i32 [[TMP26]], i32 [[TMP9:%.*]]), !dbg [[DBG51]]
+; SHADERTEST-NEXT:    br label [[DOTENDGS]]
+; SHADERTEST:       .endGs:
+; SHADERTEST-NEXT:    ret void
+;
+;
+; SHADERTEST-LABEL: define internal amdgpu_es void @_amdgpu_es_main(
+; SHADERTEST-NEXT:  .entry:
+; SHADERTEST-NEXT:    ret void, !dbg [[DBG56:![0-9]+]]
+;
+;
+; SHADERTEST-LABEL: define internal amdgpu_gs void @_amdgpu_gs_main.1(
+; SHADERTEST-NEXT:  .entry:
+; SHADERTEST-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; SHADERTEST-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP0]])
+; SHADERTEST-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.GS, i32 [[ESGSOFFSET0:%.*]]
+; SHADERTEST-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(3) [[TMP2]], align 4
+; SHADERTEST-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 4
+; SHADERTEST-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(3) [[TMP4]], align 4
+; SHADERTEST-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 8
+; SHADERTEST-NEXT:    [[TMP7:%.*]] = load float, ptr addrspace(3) [[TMP6]], align 4
+; SHADERTEST-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 12
+; SHADERTEST-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(3) [[TMP8]], align 4
+; SHADERTEST-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) @Lds.GS, i32 [[GSVSOFFSET:%.*]], !dbg [[DBG59:![0-9]+]]
+; SHADERTEST-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP10]], i32 1280, !dbg [[DBG59]]
+; SHADERTEST-NEXT:    [[DOTIDX:%.*]] = mul i32 [[TMP1]], 20, !dbg [[DBG59]]
+; SHADERTEST-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 [[DOTIDX]], !dbg [[DBG59]]
+; SHADERTEST-NEXT:    store float [[TMP3]], ptr addrspace(3) [[TMP12]], align 4, !dbg [[DBG59]]
+; SHADERTEST-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 4, !dbg [[DBG59]]
+; SHADERTEST-NEXT:    store float [[TMP5]], ptr addrspace(3) [[TMP13]], align 4, !dbg [[DBG59]]
+; SHADERTEST-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 8, !dbg [[DBG59]]
+; SHADERTEST-NEXT:    store float [[TMP7]], ptr addrspace(3) [[TMP14]], align 4, !dbg [[DBG59]]
+; SHADERTEST-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 12, !dbg [[DBG59]]
+; SHADERTEST-NEXT:    store i32 [[TMP9]], ptr addrspace(3) [[TMP15]], align 4, !dbg [[DBG59]]
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.s.sendmsg(i32 34, i32 [[GSWAVEID:%.*]]), !dbg [[DBG59]]
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.s.sendmsg(i32 18, i32 [[GSWAVEID]]), !dbg [[DBG60:![0-9]+]]
+; SHADERTEST-NEXT:    fence syncscope("workgroup") release, !dbg [[DBG60]]
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.s.sendmsg(i32 3, i32 [[GSWAVEID]]), !dbg [[DBG60]]
+; SHADERTEST-NEXT:    ret void, !dbg [[DBG60]]
+;
+;
+; SHADERTEST-LABEL: define dllexport amdgpu_vs void @_amdgpu_vs_main(
+; SHADERTEST-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr addrspace(3) @Lds.GS, i32 [[VERTEXOFFSET:%.*]]
+; SHADERTEST-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP1]], i32 1280
+; SHADERTEST-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr addrspace(3) [[TMP2]], align 4, !invariant.load !49
+; SHADERTEST-NEXT:    [[DOTI3:%.*]] = extractelement <4 x float> [[TMP3]], i64 3
+; SHADERTEST-NEXT:    [[DOTI2:%.*]] = extractelement <4 x float> [[TMP3]], i64 2
+; SHADERTEST-NEXT:    [[DOTI1:%.*]] = extractelement <4 x float> [[TMP3]], i64 1
+; SHADERTEST-NEXT:    [[DOTI0:%.*]] = extractelement <4 x float> [[TMP3]], i64 0
+; SHADERTEST-NEXT:    call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float [[DOTI0]], float [[DOTI1]], float [[DOTI2]], float [[DOTI3]], i1 true, i1 false)
+; SHADERTEST-NEXT:    ret void
+;
+;
+; SHADERTEST-LABEL: @_amdgpu_ps_main(
+; SHADERTEST-NEXT:    ret void
+;
+;.
+; SHADERTEST: attributes #[[ATTR0:[0-9]+]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "target-features"=",+wavefrontsize64" }
+; SHADERTEST: attributes #[[ATTR1:[0-9]+]] = { alwaysinline nounwind memory(readwrite) "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64" }
+; SHADERTEST: attributes #[[ATTR2:[0-9]+]] = { nounwind willreturn memory(none) }
+; SHADERTEST: attributes #[[ATTR3:[0-9]+]] = { nounwind }
+; SHADERTEST: attributes #[[ATTR4:[0-9]+]] = { nounwind willreturn memory(read) }
+; SHADERTEST: attributes #[[ATTR5:[0-9]+]] = { "target-features"=",+wavefrontsize64" }
+; SHADERTEST: attributes #[[ATTR6:[0-9]+]] = { memory(readwrite) "InitialPSInputAddr"="0" "amdgpu-color-export"="0" "amdgpu-depth-export"="0" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "target-features"=",+wavefrontsize64" }
+; SHADERTEST: attributes #[[ATTR7:[0-9]+]] = { nounwind memory(none) }
+; SHADERTEST: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(read) }
+; SHADERTEST: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; SHADERTEST: attributes #[[ATTR10:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; SHADERTEST: attributes #[[ATTR11:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+; SHADERTEST: attributes #[[ATTR12:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+;.
+; SHADERTEST: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "spirv", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !2)
+; SHADERTEST: [[META1:![0-9]+]] = !DIFile(filename: "test.vert", directory: ".", source: "// OpModuleProcessed client vulkan100\0A// OpModuleProcessed target-env vulkan1.0\0A// OpModuleProcessed entry-point main\0A#line 1\0A#version 450\0Alayout(location = 0) in vec4 position
+; SHADERTEST: [[META2:![0-9]+]] = !{!3, !9}
+; SHADERTEST: [[META3:![0-9]+]] = !DIGlobalVariableExpression(var: !4, expr: !DIExpression())
+; SHADERTEST: [[META4:![0-9]+]] = distinct !DIGlobalVariable(name: "positionOut", linkageName: "positionOut", scope: !0, file: !1, line: 7, type: !5, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META5:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 128, flags: DIFlagVector, elements: !7)
+; SHADERTEST: [[META6:![0-9]+]] = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
+; SHADERTEST: [[META7:![0-9]+]] = !{!8}
+; SHADERTEST: [[META8:![0-9]+]] = !DISubrange(count: 4, lowerBound: 0)
+; SHADERTEST: [[META9:![0-9]+]] = !DIGlobalVariableExpression(var: !10, expr: !DIExpression())
+; SHADERTEST: [[META10:![0-9]+]] = distinct !DIGlobalVariable(name: "position", linkageName: "position", scope: !0, file: !1, line: 7, type: !5, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META11:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !12, producer: "spirv", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !13)
+; SHADERTEST: [[META12:![0-9]+]] = !DIFile(filename: "test.geom", directory: ".", source: "// OpModuleProcessed client vulkan100\0A// OpModuleProcessed target-env vulkan1.0\0A// OpModuleProcessed entry-point main\0A#line 1\0A#version 450\0Alayout(points) in
+; SHADERTEST: [[META13:![0-9]+]] = !{!14, !24, !30}
+; SHADERTEST: [[META14:![0-9]+]] = !DIGlobalVariableExpression(var: !15, expr: !DIExpression())
+; SHADERTEST: [[META15:![0-9]+]] = distinct !DIGlobalVariable(scope: !11, file: !12, line: 8, type: !16, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META16:![0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "gl_PerVertex", file: !12, line: 8, flags: DIFlagPublic, elements: !17, identifier: "gl_PerVertex")
+; SHADERTEST: [[META17:![0-9]+]] = !{!18, !19, !20, !20}
+; SHADERTEST: [[META18:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "gl_Position", file: !12, line: 2, baseType: !5, flags: DIFlagPublic)
+; SHADERTEST: [[META19:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "gl_PointSize", file: !12, line: 2, baseType: !6, flags: DIFlagPublic)
+; SHADERTEST: [[META20:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "gl_CullDistance", file: !12, line: 3, baseType: !21, flags: DIFlagPublic)
+; SHADERTEST: [[META21:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 32, elements: !22)
+; SHADERTEST: [[META22:![0-9]+]] = !{!23}
+; SHADERTEST: [[META23:![0-9]+]] = !DISubrange(count: 1, lowerBound: 0)
+; SHADERTEST: [[META24:![0-9]+]] = !DIGlobalVariableExpression(var: !25, expr: !DIExpression())
+; SHADERTEST: [[META25:![0-9]+]] = distinct !DIGlobalVariable(name: "gl_in", linkageName: "gl_in", scope: !11, file: !12, line: 8, type: !26, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META26:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !27, elements: !22)
+; SHADERTEST: [[META27:![0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "gl_PerVertex", file: !12, line: 8, flags: DIFlagPublic, elements: !28, identifier: "gl_PerVertex")
+; SHADERTEST: [[META28:![0-9]+]] = !{!18, !19, !29, !29}
+; SHADERTEST: [[META29:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "gl_CullDistance", file: !12, line: 2, baseType: !21, flags: DIFlagPublic)
+; SHADERTEST: [[META30:![0-9]+]] = !DIGlobalVariableExpression(var: !31, expr: !DIExpression())
+; SHADERTEST: [[META31:![0-9]+]] = distinct !DIGlobalVariable(name: "position", linkageName: "position", scope: !11, file: !12, line: 10, type: !32, isLocal: false, isDefinition: true)
+; SHADERTEST: [[META32:![0-9]+]] = !DICompositeType(tag: DW_TAG_array_type, baseType: !5, size: 128, elements: !22)
+; SHADERTEST: [[META33:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: !34, producer: "lgc", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
+; SHADERTEST: [[META34:![0-9]+]] = !DIFile(filename: "internal", directory: "")
+; SHADERTEST: [[META35:![0-9]+]] = !{!"Vulkan"}
+; SHADERTEST: [[META36:![0-9]+]] = !{i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 2, i32 1
+; SHADERTEST: [[META37:![0-9]+]] = !{i32 {{.*}}, i32 {{.*}}, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1}
+; SHADERTEST: [[META38:![0-9]+]] = !{i32 {{.*}}, i32 {{.*}}, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1}
+; SHADERTEST: [[META39:![0-9]+]] = !{!"PushConst", i32 9, i32 82, i32 0, i32 13, i64 4294967295, i32 0, i32 4}
+; SHADERTEST: [[META40:![0-9]+]] = !{!"StreamOutTableVaPtr", i32 11, i32 32, i32 13, i32 1, i32 0}
+; SHADERTEST: [[META41:![0-9]+]] = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 14, i32 1, i32 4}
+; SHADERTEST: [[META42:![0-9]+]] = !{i32 0, i32 0, i32 0, i32 16, i32 14, i32 7}
+; SHADERTEST: [[META43:![0-9]+]] = !{!"\82\B0amdpal.pipelines{{.*}}AEamdpal.version\92\03\00"}
+; SHADERTEST: [[META44:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4}
+; SHADERTEST: [[META45:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
+; SHADERTEST: [[META46:![0-9]+]] = !{i32 0, i32 0, i32 1, i32 1}
+; SHADERTEST: [[META47:![0-9]+]] = distinct !DISubprogram(name: "lgc.shader.ESGS.main", scope: !34, file: !34, type: !48, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !33)
+; SHADERTEST: [[META48:![0-9]+]] = !DISubroutineType(types: !49)
+; SHADERTEST: [[META49:![0-9]+]] = !{}
+; SHADERTEST: [[META50:![0-9]+]] = !{i32 4}
+; SHADERTEST: [[DBG51]] = !DILocation(line: 0, scope: !47)
+; SHADERTEST: [[META52:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !1, type: !53, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !0, templateParams: !49)
+; SHADERTEST: [[META53:![0-9]+]] = !DISubroutineType(flags: DIFlagPublic, types: !54)
+; SHADERTEST: [[META54:![0-9]+]] = !{null}
+; SHADERTEST: [[META55:![0-9]+]] = !{i32 0}
+; SHADERTEST: [[DBG56]] = !DILocation(line: 7, scope: !52)
+; SHADERTEST: [[META57:![0-9]+]] = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !12, type: !53, flags: DIFlagPublic, spFlags: DISPFlagDefinition, unit: !11, templateParams: !49)
+; SHADERTEST: [[META58:![0-9]+]] = !{i32 3}
+; SHADERTEST: [[DBG59]] = !DILocation(line: 9, scope: !57)
+; SHADERTEST: [[DBG60]] = !DILocation(line: 10, scope: !57)
+; SHADERTEST: [[META61:![0-9]+]] = !{i32 8}
+; SHADERTEST: [[META62:![0-9]+]] = !{i32 6}
+;.
diff --git a/llpc/test/shaderdb/extensions/ExtFragMask_TestFragFetch_lit.frag b/llpc/test/shaderdb/extensions/ExtFragMask_TestFragFetch_lit.frag
index 9c2495457e..dbb43efc23 100644
--- a/llpc/test/shaderdb/extensions/ExtFragMask_TestFragFetch_lit.frag
+++ b/llpc/test/shaderdb/extensions/ExtFragMask_TestFragFetch_lit.frag
@@ -39,12 +39,12 @@ void main()
 ; SHADERTEST: call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 1, i32 544,
 ; SHADERTEST: call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 6, i32 544,
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.2d.i32.i16(i32 1, i16 2, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 2, i32 3, i32
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.3d.i32.i16(i32 1, i16 2, i16 3, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.load.2darraymsaa.v4i32.i32(i32 15, i32 2, i32 3, i32 1, i32
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.2d.i32.i32(i32 1, i32 %{{.*}}, i32 %{{.*}}, <8 x i32>
-; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.load.2dmsaa.v4i32.i32(i32 15, i32
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.2d.i32.i16{{(\.v8i32)?}}(i32 1, i16 2, i16 3, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32{{(\.v8i32)?}}(i32 15, i32 2, i32 3, i32
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.3d.i32.i16{{(\.v8i32)?}}(i32 1, i16 2, i16 3, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.load.2darraymsaa.v4i32.i32{{(\.v8i32)?}}(i32 15, i32 2, i32 3, i32 1, i32
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.2d.i32.i32{{(\.v8i32)?}}(i32 1, i32 %{{.*}}, i32 %{{.*}}, <8 x i32>
+; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.load.2dmsaa.v4i32.i32{{(\.v8i32)?}}(i32 15, i32
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/general/CoherentArray.frag b/llpc/test/shaderdb/general/CoherentArray.frag
new file mode 100644
index 0000000000..e00e85d17b
--- /dev/null
+++ b/llpc/test/shaderdb/general/CoherentArray.frag
@@ -0,0 +1,30 @@
+#version 450
+layout(set = 1, binding = 0) buffer coherent b
+{
+	vec4 v[3];
+};
+
+void main()
+{
+	v = vec4[3](vec4(42), vec4(42), vec4(42));
+}
+
+// BEGIN_SHADERTEST
+/*
+; RUN: amdllpc -v --verify-ir %gfxip %s | FileCheck -check-prefix=SHADERTEST %s
+; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
+; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1109917696, <4 x i32> [[DESC:%[0-9]+]], i32 0, i32 0, i32 1)
+; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1109917696, <4 x i32> [[DESC]], i32 4, i32 0, i32 1)
+; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1109917696, <4 x i32> [[DESC]], i32 8, i32 0, i32 1)
+; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1109917696, <4 x i32> [[DESC]], i32 12, i32 0, i32 1)
+; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1109917696, <4 x i32> [[DESC]], i32 16, i32 0, i32 1)
+; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1109917696, <4 x i32> [[DESC]], i32 20, i32 0, i32 1)
+; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1109917696, <4 x i32> [[DESC]], i32 24, i32 0, i32 1)
+; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1109917696, <4 x i32> [[DESC]], i32 28, i32 0, i32 1)
+; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1109917696, <4 x i32> [[DESC]], i32 32, i32 0, i32 1)
+; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1109917696, <4 x i32> [[DESC]], i32 36, i32 0, i32 1)
+; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1109917696, <4 x i32> [[DESC]], i32 40, i32 0, i32 1)
+; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1109917696, <4 x i32> [[DESC]], i32 44, i32 0, i32 1)
+; SHADERTEST: AMDLLPC SUCCESS
+*/
+// END_SHADERTEST
diff --git a/llpc/test/shaderdb/general/CoherentVector.frag b/llpc/test/shaderdb/general/CoherentVector.frag
new file mode 100644
index 0000000000..204ccb280e
--- /dev/null
+++ b/llpc/test/shaderdb/general/CoherentVector.frag
@@ -0,0 +1,22 @@
+#version 450
+layout(set = 1, binding = 0) coherent buffer b
+{
+	vec4 v;
+};
+
+void main()
+{
+	v = vec4(42);
+}
+
+// BEGIN_SHADERTEST
+/*
+; RUN: amdllpc -v --verify-ir %gfxip %s | FileCheck -check-prefix=SHADERTEST %s
+; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results
+; SHADERTEST: store atomic float 4.200000e+01, ptr addrspace(7) @0 unordered, align 4
+; SHADERTEST: store atomic float 4.200000e+01, ptr addrspace(7) getelementptr ([4 x float], ptr addrspace(7) @0, i32 0, i32 1) unordered, align 4
+; SHADERTEST: store atomic float 4.200000e+01, ptr addrspace(7) getelementptr ([4 x float], ptr addrspace(7) @0, i32 0, i32 2) unordered, align 4
+; SHADERTEST: store atomic float 4.200000e+01, ptr addrspace(7) getelementptr ([4 x float], ptr addrspace(7) @0, i32 0, i32 3) unordered, align 4
+; SHADERTEST: AMDLLPC SUCCESS
+*/
+// END_SHADERTEST
diff --git a/llpc/test/shaderdb/general/ImgDescLoad.comp b/llpc/test/shaderdb/general/ImgDescLoad.comp
index ce7aac0d8b..f0f462d1fb 100644
--- a/llpc/test/shaderdb/general/ImgDescLoad.comp
+++ b/llpc/test/shaderdb/general/ImgDescLoad.comp
@@ -7,7 +7,7 @@
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results
 ; SHADERTEST: [[IMG_DESC:%[0-9]*]] = load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}, align 4, !invariant.load !12
 ; SHADERTEST: [[SMP_DESC:%[0-9]*]] = load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}, align 4, !invariant.load !12
-; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[IMG_DESC]], <4 x i32> [[SMP_DESC]], i1 false, i32 0, i32 0)
+; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[IMG_DESC]], <4 x i32> [[SMP_DESC]], i1 false, i32 0, i32 0)
 */
 // END_SHADERTEST
 
diff --git a/llpc/test/shaderdb/general/OptimizePointSizeWrite.pipe b/llpc/test/shaderdb/general/OptimizePointSizeWrite.pipe
new file mode 100644
index 0000000000..f0536cf8ac
--- /dev/null
+++ b/llpc/test/shaderdb/general/OptimizePointSizeWrite.pipe
@@ -0,0 +1,84 @@
+; This test verifies the PointSize write optimization: when the value written to PointSize is 1.0, the write can be
+; removed safely in Vulkan. The optimization is controlled by the pipeline option optimizePointSizeWrite.
+
+; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s
+
+; SHADERTEST-LABEL: {{^// LLPC}} final pipeline module info
+; SHADERTEST: call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i1 true, i1 false)
+
+; SHADERTEST-LABEL: {{^// LLPC}} final ELF info
+; SHADERTEST: v_mov_b32_e32 v0, 1.0
+; SHADERTEST-NEXT: exp pos0 v0, v0, v0, v0 done
+
+; SHADERTEST: .spi_shader_pos_format: [ 0x0000000000000004 0x0000000000000000 0x0000000000000000 0x0000000000000000 0x0000000000000000 ]
+
+[Version]
+version = 75
+
+[VsGlsl]
+#version 450 core
+
+void main() {
+  gl_Position = vec4(1.0);
+  gl_PointSize = 1.0;
+}
+
+[VsInfo]
+entryPoint = main
+
+[FsGlsl]
+#version 450 core
+
+layout(location = 0) out vec4 color;
+
+void main() {
+  color = vec4(1.0);
+}
+
+[FsInfo]
+entryPoint = main
+
+[ResourceMapping]
+userDataNode[0].visibility = 2
+userDataNode[0].type = IndirectUserDataVaPtr
+userDataNode[0].offsetInDwords = 0
+userDataNode[0].sizeInDwords = 1
+userDataNode[0].indirectUserDataCount = 4
+userDataNode[1].visibility = 66
+userDataNode[1].type = DescriptorTableVaPtr
+userDataNode[1].offsetInDwords = 6
+userDataNode[1].sizeInDwords = 1
+userDataNode[1].next[0].type = DescriptorConstBufferCompact
+userDataNode[1].next[0].offsetInDwords = 0
+userDataNode[1].next[0].sizeInDwords = 2
+userDataNode[1].next[0].set = 0x0000005D
+userDataNode[1].next[0].binding = 17
+userDataNode[1].next[0].strideInDwords = 0
+userDataNode[1].next[1].type = DescriptorConstBuffer
+userDataNode[1].next[1].offsetInDwords = 2
+userDataNode[1].next[1].sizeInDwords = 8
+userDataNode[1].next[1].set = 0x0000005D
+userDataNode[1].next[1].binding = 0
+userDataNode[1].next[1].strideInDwords = 0
+userDataNode[1].next[2].type = DescriptorBuffer
+userDataNode[1].next[2].offsetInDwords = 10
+userDataNode[1].next[2].sizeInDwords = 8
+userDataNode[1].next[2].set = 0x0000005D
+userDataNode[1].next[2].binding = 1
+userDataNode[1].next[2].strideInDwords = 0
+userDataNode[2].visibility = 4
+userDataNode[2].type = StreamOutTableVaPtr
+userDataNode[2].offsetInDwords = 2
+userDataNode[2].sizeInDwords = 1
+
+[GraphicsPipelineState]
+topology = VK_PRIMITIVE_TOPOLOGY_LINE_STRIP
+provokingVertexMode = VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT
+depthClipEnable = 1
+numSamples = 1
+colorBuffer[0].format = VK_FORMAT_R8G8B8A8_UNORM
+colorBuffer[0].channelWriteMask = 15
+nggState.enableNgg = 1
+options.enableImplicitInvariantExports = 1
+options.optimizationLevel = 2
+options.optimizePointSizeWrite = 1
diff --git a/llpc/test/shaderdb/general/PipelineCs_TestFetch2DMSFmaskBased_lit.pipe b/llpc/test/shaderdb/general/PipelineCs_TestFetch2DMSFmaskBased_lit.pipe
index d3668cd458..a432a873c9 100644
--- a/llpc/test/shaderdb/general/PipelineCs_TestFetch2DMSFmaskBased_lit.pipe
+++ b/llpc/test/shaderdb/general/PipelineCs_TestFetch2DMSFmaskBased_lit.pipe
@@ -10,8 +10,8 @@
 ; SHADERTEST: call {{.*}} @lgc.create.image.load.with.fmask.v4f32(i32 6, i32 1536, {{.*}}, {{.*}}, <2 x i32> <i32 0, i32 1>, i32 4)
 
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.2d.i32.i16(i32 1, i16 0, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
-; SHADERTEST: %{{.*}} = call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 0, i32 1, i32 %{{.*}}, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.2d.i32.i16{{(\.v8i32)?}}(i32 1, i16 0, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0), !invariant.load !{{.*}}
+; SHADERTEST: %{{.*}} = call {{.*}} <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32{{(\.v8i32)?}}(i32 15, i32 0, i32 1, i32 %{{.*}}, <8 x i32> %{{.*}}, i32 0, i32 0)
 ; SHADERTEST: AMDLLPC SUCCESS
 ; END_SHADERTEST
 
diff --git a/llpc/test/shaderdb/general/PipelineVsFs_ColorExportShader.pipe b/llpc/test/shaderdb/general/PipelineVsFs_ColorExportShader.pipe
new file mode 100644
index 0000000000..0b3de54a08
--- /dev/null
+++ b/llpc/test/shaderdb/general/PipelineVsFs_ColorExportShader.pipe
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py
+; RUN: amdllpc -filetype=asm -o - %s | FileCheck -check-prefix=CHECK %s
+
+; This test checks that the assembly output contains all shader stages, not just the last one.
+
+[Version]
+version = 72
+
+[VsGlsl]
+#version 450
+
+layout(location = 0) in vec4 inPos;
+
+void main() {
+  gl_Position = inPos;
+}
+
+[VsInfo]
+entryPoint = main
+
+[FsGlsl]
+#version 450
+
+layout(location = 0) out vec4 outColor;
+
+void main() {
+  outColor = vec4(1.0, 0.0, 1.0, 1.0);
+}
+
+[FsInfo]
+entryPoint = main
+
+[GraphicsPipelineState]
+colorBuffer[0].format = VK_FORMAT_R32G32B32A32_SFLOAT
+colorBuffer[0].channelWriteMask = 15
+colorBuffer[0].blendEnable = 0
+enableColorExportShader = 1
+
+; CHECK:     amdgpu_vs_main:
+; CHECK:     amdgpu_ps_main:
diff --git a/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe b/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe
index bdbb83a8e3..fd4a2d1dd5 100644
--- a/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe
+++ b/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe
@@ -46,6 +46,7 @@ entryPoint = main
 [GraphicsPipelineState]
 colorBuffer[0].format = VK_FORMAT_B8G8R8A8_UNORM
 colorBuffer[0].channelWriteMask = 15
+options.enableImplicitInvariantExports = 1
 
 [VertexInputState]
 binding[0].binding = 0
diff --git a/llpc/test/shaderdb/general/TestWorkgroupIdOpt.comp b/llpc/test/shaderdb/general/TestWorkgroupIdOpt.comp
index 4a05fb20cb..e02065bd35 100644
--- a/llpc/test/shaderdb/general/TestWorkgroupIdOpt.comp
+++ b/llpc/test/shaderdb/general/TestWorkgroupIdOpt.comp
@@ -32,7 +32,7 @@ void main()
 // CHECK: attributes #[[ATTR0]] = { alwaysinline nounwind memory(readwrite) "amdgpu-flat-work-group-size"="256,256" "amdgpu-memory-bound"="false" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-prealloc-sgpr-spill-vgprs" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="3" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,+cumode,+enable-flat-scratch" }
 // CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 // CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
-// CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind willreturn memory(none) }
+// CHECK: attributes #[[ATTR3:[0-9]+]] = {{{.*}} nounwind willreturn memory(none) }
 // CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(write) }
 //.
 // CHECK: [[META0:![0-9]+]] = !{i32 16, i32 16, i32 1}
diff --git a/llpc/test/shaderdb/gfx11/AttributePrecedesPos.pipe b/llpc/test/shaderdb/gfx11/AttributePrecedesPos.pipe
index 243b999907..031e3ede4a 100644
--- a/llpc/test/shaderdb/gfx11/AttributePrecedesPos.pipe
+++ b/llpc/test/shaderdb/gfx11/AttributePrecedesPos.pipe
@@ -2,12 +2,12 @@
 
 ; RUN: amdllpc %gfxip %s -v | FileCheck -check-prefix=SHADERTEST %s
 ; SHADERTEST-LABEL: @_amdgpu_gs_main(
-; SHADERTEST: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 0, i32 %{{.*}}, i32 3)
+; SHADERTEST: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 0, i32 %{{.*}}, i32 1)
 ; SHADERTEST: fence syncscope("agent") release
 ; SHADERTEST: call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, i1 false, i1 false)
 ; SHADERTEST: call void @llvm.amdgcn.exp.f32(i32 13, i32 1, float 1.000000e+00, float poison, float poison, float poison, i1 true, i1 false)
 ; SHADERTEST-LABEL: _amdgpu_gs_main:
-; SHADERTEST: buffer_store_b128 {{v[[0-9]*:[0-9]*]}}, {{v[0-9]*}}, {{s[[0-9]*:[0-9]*]}}, {{s[0-9]*}} idxen glc slc
+; SHADERTEST: buffer_store_b128 {{v[[0-9]*:[0-9]*]}}, {{v[0-9]*}}, {{s[[0-9]*:[0-9]*]}}, {{s[0-9]*}} idxen glc
 ; SHADERTEST: s_waitcnt_vscnt null, 0x0
 ; SHADERTEST: exp pos0 {{v[0-9]*}}, {{v[0-9]*}}, {{v[0-9]*}}, {{v[0-9]*}}
 ; SHADERTEST: exp pos1 {{v[0-9]*}}, off, off, off done
diff --git a/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe b/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe
index c34b187492..af434be2e9 100644
--- a/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe
+++ b/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe
@@ -341,7 +341,7 @@ colorBuffer[0].blendSrcAlphaToColor = 0
 ; CHECK-NEXT:         .vgpr_count:     0x9
 ; CHECK-NEXT:         .vgpr_limit:     0x100
 ; CHECK-NEXT:         .wavefront_size: 0x40
-; CHECK-NEXT:         .wgp_mode:       false
+; CHECK-NEXT:         .wgp_mode:       true
 ; CHECK-NEXT:       .ps:
 ; CHECK-NEXT:         .checksum_value: 0x2cbaf88c
 ; CHECK-NEXT:         .debug_mode:     false
@@ -349,7 +349,7 @@ colorBuffer[0].blendSrcAlphaToColor = 0
 ; CHECK-NEXT:         .float_mode:     0xc0
 ; CHECK-NEXT:         .ieee_mode:      false
 ; CHECK-NEXT:         .image_op:       true
-; CHECK:         .mem_ordered:    true
+; CHECK:              .mem_ordered:    true
 ; CHECK-NEXT:         .scratch_en:     false
 ; CHECK-NEXT:         .scratch_memory_size: 0
 ; CHECK-NEXT:         .sgpr_count:     0x11
diff --git a/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp b/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp
index 6aad9398cd..a8ae335582 100644
--- a/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp
+++ b/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp
@@ -34,28 +34,29 @@ void main() {
 // CHECK-NEXT:  .entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call ptr addrspace(4) (...) @lgc.create.load.push.constants.ptr.p4()
 // CHECK-NEXT:    [[TMP1:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2)
-// CHECK-NEXT:    [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP1]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr {{(inbounds )?}}i8, ptr addrspace(7) [[TMP1]], i32 512
-// CHECK-NEXT:    [[LOAD2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP2]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16)
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4
-// CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 2
-// CHECK-NEXT:    br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP10:%.*]]
-// CHECK:       5:
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP3]], 1
-// CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], <8 x float> [[LOAD2]], <8 x float> [[LOAD]]
-// CHECK-NEXT:    [[TMP8:%.*]] = call half (...) @lgc.cooperative.matrix.extract__f16(<8 x float> [[TMP7]], i32 3, i32 1, i32 0)
-// CHECK-NEXT:    [[TMP9:%.*]] = fptoui half [[TMP8]] to i32
-// CHECK-NEXT:    br label [[TMP10]]
-// CHECK:       10:
-// CHECK-NEXT:    [[TMP11:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[TMP9]], [[TMP5]] ]
-// CHECK-NEXT:    store i32 [[TMP11]], ptr addrspace(7) [[TMP1]], align 4
-// CHECK-NEXT:    [[TMP12:%.*]] = getelementptr {{(inbounds )?}}i8, ptr addrspace(4) [[TMP0]], i64 4
-// CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(4) [[TMP12]], align 4
-// CHECK-NEXT:    [[TMP14:%.*]] = icmp ult i32 [[TMP13]], 2
-// CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 1
-// CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], <8 x float> [[LOAD2]], <8 x float> [[LOAD]]
-// CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP14]], <8 x float> [[TMP16]], <8 x float> zeroinitializer
-// CHECK-NEXT:    [[TMP18:%.*]] = getelementptr {{(inbounds )?}}i8, ptr addrspace(7) [[TMP1]], i32 1024
-// CHECK-NEXT:    call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP18]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP17]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP1]], i32 16, i32 0)
+// CHECK-NEXT:    [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP2]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP1]], i32 16, i32 32)
+// CHECK-NEXT:    [[LOAD2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP3]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16)
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 2
+// CHECK-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP11:%.*]]
+// CHECK:       6:
+// CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], <8 x float> [[LOAD2]], <8 x float> [[LOAD]]
+// CHECK-NEXT:    [[TMP9:%.*]] = call half (...) @lgc.cooperative.matrix.extract__f16(<8 x float> [[TMP8]], i32 3, i32 1, i32 0)
+// CHECK-NEXT:    [[TMP10:%.*]] = fptoui half [[TMP9]] to i32
+// CHECK-NEXT:    br label [[TMP11]]
+// CHECK:       11:
+// CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP6]] ]
+// CHECK-NEXT:    store i32 [[TMP12]], ptr addrspace(7) [[TMP2]], align 4
+// CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i64 4
+// CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(4) [[TMP13]], align 4
+// CHECK-NEXT:    [[TMP15:%.*]] = icmp ult i32 [[TMP14]], 2
+// CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 1
+// CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], <8 x float> [[LOAD2]], <8 x float> [[LOAD]]
+// CHECK-NEXT:    [[TMP18:%.*]] = select i1 [[TMP15]], <8 x float> [[TMP17]], <8 x float> zeroinitializer
+// CHECK-NEXT:    [[TMP19:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP1]], i32 16, i32 64)
+// CHECK-NEXT:    call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP19]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP18]])
 // CHECK-NEXT:    ret void
 //
diff --git a/llpc/test/shaderdb/gfx11/cooperativeMatrix/extract-insert.spvasm b/llpc/test/shaderdb/gfx11/cooperativeMatrix/extract-insert.spvasm
index bd0816b5e8..bd9bae7354 100644
--- a/llpc/test/shaderdb/gfx11/cooperativeMatrix/extract-insert.spvasm
+++ b/llpc/test/shaderdb/gfx11/cooperativeMatrix/extract-insert.spvasm
@@ -123,25 +123,28 @@
 ; CHECK-NEXT:  .entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 1, i32 0, i32 2)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2)
-; CHECK-NEXT:    [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP1]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16)
-; CHECK-NEXT:    [[LOAD1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP0]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16)
-; CHECK-NEXT:    br label [[TMP2:%.*]]
-; CHECK:       2:
-; CHECK-NEXT:    [[DOT012:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[TMP10:%.*]], [[TMP5:%.*]] ]
-; CHECK-NEXT:    [[DOT0:%.*]] = phi <8 x float> [ undef, [[DOTENTRY]] ], [ [[TMP9:%.*]], [[TMP5]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @lgc.cooperative.matrix.length(i32 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt i32 [[DOT012]], [[TMP3]]
-; CHECK-NEXT:    [[COND_FREEZE:%.*]] = freeze i1 [[TMP4]]
-; CHECK-NEXT:    br i1 [[COND_FREEZE]], label [[TMP5]], label [[TMP11:%.*]]
-; CHECK:       5:
-; CHECK-NEXT:    [[TMP6:%.*]] = call half (...) @lgc.cooperative.matrix.extract__f16(<8 x float> [[LOAD]], i32 [[DOT012]], i32 1, i32 0)
-; CHECK-NEXT:    [[TMP7:%.*]] = call half (...) @lgc.cooperative.matrix.extract__f16(<8 x float> [[LOAD1]], i32 [[DOT012]], i32 1, i32 0)
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul reassoc nnan nsz arcp contract afn half [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9]] = call <8 x float> (...) @lgc.cooperative.matrix.insert__v8f32(<8 x float> [[DOT0]], half [[TMP8]], i32 [[DOT012]], i32 1, i32 0)
-; CHECK-NEXT:    [[TMP10]] = add i32 [[DOT012]], 1
-; CHECK-NEXT:    br label [[TMP2]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       11:
-; CHECK-NEXT:    [[TMP12:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 2, i32 0, i32 2)
-; CHECK-NEXT:    call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP12]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[DOT0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP1]], i32 16, i32 0)
+; CHECK-NEXT:    [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP2]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16)
+; CHECK-NEXT:    [[TMP3:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP0]], i32 16, i32 0)
+; CHECK-NEXT:    [[LOAD1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP3]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16)
+; CHECK-NEXT:    br label [[TMP4:%.*]]
+; CHECK:       4:
+; CHECK-NEXT:    [[DOT012:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[TMP12:%.*]], [[TMP7:%.*]] ]
+; CHECK-NEXT:    [[DOT0:%.*]] = phi <8 x float> [ undef, [[DOTENTRY]] ], [ [[TMP11:%.*]], [[TMP7]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @lgc.cooperative.matrix.length(i32 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt i32 [[DOT012]], [[TMP5]]
+; CHECK-NEXT:    [[COND_FREEZE:%.*]] = freeze i1 [[TMP6]]
+; CHECK-NEXT:    br i1 [[COND_FREEZE]], label [[TMP7]], label [[TMP13:%.*]]
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = call half (...) @lgc.cooperative.matrix.extract__f16(<8 x float> [[LOAD]], i32 [[DOT012]], i32 1, i32 0)
+; CHECK-NEXT:    [[TMP9:%.*]] = call half (...) @lgc.cooperative.matrix.extract__f16(<8 x float> [[LOAD1]], i32 [[DOT012]], i32 1, i32 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul reassoc nnan nsz arcp contract afn half [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11]] = call <8 x float> (...) @lgc.cooperative.matrix.insert__v8f32(<8 x float> [[DOT0]], half [[TMP10]], i32 [[DOT012]], i32 1, i32 0)
+; CHECK-NEXT:    [[TMP12]] = add i32 [[DOT012]], 1
+; CHECK-NEXT:    br label [[TMP4]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP14:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 2, i32 0, i32 2)
+; CHECK-NEXT:    [[TMP15:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP14]], i32 16, i32 0)
+; CHECK-NEXT:    call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP15]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[DOT0]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llpc/test/shaderdb/gfx11/cooperativeMatrix/loadstore-uvec4.comp b/llpc/test/shaderdb/gfx11/cooperativeMatrix/loadstore-uvec4.comp
index 7b5e948de7..a41a7fee79 100644
--- a/llpc/test/shaderdb/gfx11/cooperativeMatrix/loadstore-uvec4.comp
+++ b/llpc/test/shaderdb/gfx11/cooperativeMatrix/loadstore-uvec4.comp
@@ -24,7 +24,9 @@ void main() {
 // CHECK-NEXT:  .entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 1, i32 0, i32 2)
 // CHECK-NEXT:    [[TMP1:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2)
-// CHECK-NEXT:    [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP1]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16)
-// CHECK-NEXT:    call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP0]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[LOAD]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP1]], i32 16, i32 0)
+// CHECK-NEXT:    [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP2]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP0]], i32 16, i32 0)
+// CHECK-NEXT:    call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP3]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[LOAD]])
 // CHECK-NEXT:    ret void
 //
diff --git a/llpc/test/shaderdb/graphics_library/PipelineVsFs_TestGraphicsLibrary.pipe b/llpc/test/shaderdb/graphics_library/PipelineVsFs_TestGraphicsLibrary.pipe
index 323b350d11..2c6c9de718 100644
--- a/llpc/test/shaderdb/graphics_library/PipelineVsFs_TestGraphicsLibrary.pipe
+++ b/llpc/test/shaderdb/graphics_library/PipelineVsFs_TestGraphicsLibrary.pipe
@@ -25,7 +25,7 @@ colorExport=PipelineLibCes_TestColorExport.pipe
 ; SHADERTEST-NEXT:    [[DOT0:%.*]] = load float, ptr addrspace(7) [[DOT0_IN]], align 4
 ; SHADERTEST-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP2]], float 1.000000e+00, i64 3
 ; SHADERTEST-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[DOT0]], i64 2
-; SHADERTEST-NEXT:    call void @lgc.output.export.builtin.Position.i32.v4f32(i32 0, <4 x float> [[TMP8]]) #[[ATTR4:[0-9]+]]
+; SHADERTEST-NEXT:    call void @lgc.output.export.builtin.Position.i32.v4f32(i32 0, <4 x float> [[TMP8]])
 ; SHADERTEST-NEXT:    ret void
 ;
 ;
@@ -34,7 +34,7 @@ colorExport=PipelineLibCes_TestColorExport.pipe
 ; SHADERTEST-NEXT:  .entry:
 ; SHADERTEST-NEXT:    [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 1, i32 1, i32 0, i32 0)
 ; SHADERTEST-NEXT:    [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]])
-; SHADERTEST-NEXT:    [[FRAGCOORD:%.*]] = call <4 x float> @lgc.input.import.builtin.FragCoord.v4f32.i32(i32 15) #[[ATTR4]]
+; SHADERTEST-NEXT:    [[FRAGCOORD:%.*]] = call <4 x float> @lgc.input.import.builtin.FragCoord.v4f32.i32(i32 15)
 ; SHADERTEST-NEXT:    [[__LLPC_INPUT_PROXY_GL_FRAGCOORD_4_VEC_EXTRACT:%.*]] = extractelement <4 x float> [[FRAGCOORD]], i64 1
 ; SHADERTEST-NEXT:    [[TMP2:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[__LLPC_INPUT_PROXY_GL_FRAGCOORD_4_VEC_EXTRACT]], -5.000000e-01
 ; SHADERTEST-NEXT:    [[TMP3:%.*]] = fptosi float [[TMP2]] to i32
diff --git a/llpc/test/shaderdb/hlsl/Hlsl_TestStructuredBuffers.spvasm b/llpc/test/shaderdb/hlsl/Hlsl_TestStructuredBuffers.spvasm
new file mode 100644
index 0000000000..4596e98b2d
--- /dev/null
+++ b/llpc/test/shaderdb/hlsl/Hlsl_TestStructuredBuffers.spvasm
@@ -0,0 +1,93 @@
+; BEGIN_SHADERTEST
+; RUN: amdllpc -v %gfxip %s -validate-spirv=false | FileCheck -check-prefix=SHADERTEST %s
+; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
+; SHADERTEST: call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[descriptor:%[0-9]+]], i32 [[index:%[0-9]+]], i32 32, i32 0, i32 0)
+; SHADERTEST: call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> [[descriptor]], i32 [[index]], i32 [[sink_idx:%.*]], i32 0, i32 0)
+; SHADERTEST: call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 [[index]], i32 0, i32 0, i32 0)
+
+; SHADERTEST-NOT: mul i32 {{%[0-9]+}}, 48
+; SHADERTEST-NOT: add i32 {{%.*}}, 32
+; SHADERTEST-NOT: call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> {{%[0-9]+}}, i32 {{%.*}}, i32 0), !invariant.load !{{[0-9]+}}
+; SHADERTEST-NOT: call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> [[descriptor]], i32 {{%.*}}, i32 0), !invariant.load !{{[0-9]+}}
+; SHADERTEST-NOT: shl i32 [[index]], 4
+; SHADERTEST-NOT: call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> {{%[0-9]+}}, <4 x i32> [[descriptor]], i32 {{%[0-9]+}}, i32 0, i32 0)
+; END_SHADERTEST
+
+               OpCapability Shader
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationID %Output %Input
+               OpExecutionMode %main LocalSize 64 1 1
+               OpSource HLSL 660
+               OpName %type_RWStructuredBuffer_v4float "type.RWStructuredBuffer.v4float"
+               OpName %Output "Output"
+               OpName %type_StructuredBuffer_MyStruct "type.StructuredBuffer.MyStruct"
+               OpName %MyStruct "MyStruct"
+               OpMemberName %MyStruct 0 "Color"
+               OpMemberName %MyStruct 1 "Normal"
+               OpMemberName %MyStruct 2 "showColor"
+               OpName %Input "Input"
+               OpName %main "main"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %Output DescriptorSet 0
+               OpDecorate %Output Binding 0
+               OpDecorate %Input DescriptorSet 0
+               OpDecorate %Input Binding 0
+               OpDecorate %_runtimearr_v4float ArrayStride 16
+               OpMemberDecorate %type_RWStructuredBuffer_v4float 0 Offset 0
+               OpDecorate %type_RWStructuredBuffer_v4float Block
+               OpMemberDecorate %MyStruct 0 Offset 0
+               OpMemberDecorate %MyStruct 1 Offset 16
+               OpMemberDecorate %MyStruct 2 Offset 32
+               OpDecorate %_runtimearr_MyStruct ArrayStride 48
+               OpMemberDecorate %type_StructuredBuffer_MyStruct 0 Offset 0
+               OpMemberDecorate %type_StructuredBuffer_MyStruct 0 NonWritable
+               OpDecorate %type_StructuredBuffer_MyStruct Block
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+      %int_2 = OpConstant %int 2
+       %uint = OpTypeInt 32 0
+     %uint_0 = OpConstant %uint 0
+      %int_1 = OpConstant %int 1
+      %float = OpTypeFloat 32
+    %v4float = OpTypeVector %float 4
+%_runtimearr_v4float = OpTypeRuntimeArray %v4float
+%type_RWStructuredBuffer_v4float = OpTypeStruct %_runtimearr_v4float
+%_ptr_StorageBuffer_type_RWStructuredBuffer_v4float = OpTypePointer StorageBuffer %type_RWStructuredBuffer_v4float
+   %MyStruct = OpTypeStruct %v4float %v4float %uint
+%_runtimearr_MyStruct = OpTypeRuntimeArray %MyStruct
+%type_StructuredBuffer_MyStruct = OpTypeStruct %_runtimearr_MyStruct
+%_ptr_StorageBuffer_type_StructuredBuffer_MyStruct = OpTypePointer StorageBuffer %type_StructuredBuffer_MyStruct
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+         %23 = OpTypeFunction %void
+%_ptr_StorageBuffer_uint = OpTypePointer StorageBuffer %uint
+       %bool = OpTypeBool
+%_ptr_StorageBuffer_v4float = OpTypePointer StorageBuffer %v4float
+     %Output = OpVariable %_ptr_StorageBuffer_type_RWStructuredBuffer_v4float StorageBuffer
+      %Input = OpVariable %_ptr_StorageBuffer_type_StructuredBuffer_MyStruct StorageBuffer
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+       %main = OpFunction %void None %23
+         %27 = OpLabel
+         %28 = OpLoad %v3uint %gl_GlobalInvocationID
+         %29 = OpCompositeExtract %uint %28 0
+         %30 = OpAccessChain %_ptr_StorageBuffer_uint %Input %int_0 %29 %int_2
+         %31 = OpLoad %uint %30
+         %32 = OpINotEqual %bool %31 %uint_0
+               OpSelectionMerge %33 None
+               OpBranchConditional %32 %34 %35
+         %34 = OpLabel
+         %36 = OpAccessChain %_ptr_StorageBuffer_v4float %Input %int_0 %29 %int_0
+         %37 = OpLoad %v4float %36
+         %38 = OpAccessChain %_ptr_StorageBuffer_v4float %Output %int_0 %29
+               OpStore %38 %37
+               OpBranch %33
+         %35 = OpLabel
+         %39 = OpAccessChain %_ptr_StorageBuffer_v4float %Input %int_0 %29 %int_1
+         %40 = OpLoad %v4float %39
+         %41 = OpAccessChain %_ptr_StorageBuffer_v4float %Output %int_0 %29
+               OpStore %41 %40
+               OpBranch %33
+         %33 = OpLabel
+               OpReturn
+               OpFunctionEnd
diff --git a/llpc/test/shaderdb/object/ObjFragMask_TestFragFetch_lit.frag b/llpc/test/shaderdb/object/ObjFragMask_TestFragFetch_lit.frag
index dda5e3a2bf..2c030c2c71 100644
--- a/llpc/test/shaderdb/object/ObjFragMask_TestFragFetch_lit.frag
+++ b/llpc/test/shaderdb/object/ObjFragMask_TestFragFetch_lit.frag
@@ -39,9 +39,9 @@ void main()
 ; SHADERTEST: call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 6, i32 544, ptr addrspace(4)
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i16
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.3d.i32.i16(i32 1, i16 2, i16 3, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
-; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.load.2darraymsaa.v4i32.i32(i32 15, i32 2, i32 3, i32 1, i32 %{{[-0-9A-Za0z_.]+}}, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 0, i32 0)
-; SHADERTEST: call i32 @llvm.amdgcn.image.load.2d.i32.i32(i32 1,
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.3d.i32.i16{{(\.v8i32)?}}(i32 1, i16 2, i16 3, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0)
+; SHADERTEST: call <4 x i32> @llvm.amdgcn.image.load.2darraymsaa.v4i32.i32{{(\.v8i32)?}}(i32 15, i32 2, i32 3, i32 1, i32 %{{[-0-9A-Za-z_.]+}}, <8 x i32> %{{[-0-9A-Za-z_.]+}}, i32 0, i32 0)
+; SHADERTEST: call i32 @llvm.amdgcn.image.load.2d.i32.i32{{(\.v8i32)?}}(i32 1,
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/object/ObjInput_TestGsBuiltIn_lit.geom b/llpc/test/shaderdb/object/ObjInput_TestGsBuiltIn_lit.geom
index 5d93f289ba..29438a3b8b 100644
--- a/llpc/test/shaderdb/object/ObjInput_TestGsBuiltIn_lit.geom
+++ b/llpc/test/shaderdb/object/ObjInput_TestGsBuiltIn_lit.geom
@@ -26,20 +26,20 @@ void main()
 ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
-; SHADERTEST: call i32 @lgc.input.import.builtin.InvocationId{{.*}}
-; SHADERTEST: call i32 @lgc.input.import.builtin.PrimitiveId{{.*}}
-; SHADERTEST: call <4 x float> @lgc.input.import.builtin.Position.v4f32{{.*}}
-; SHADERTEST: call float @lgc.input.import.builtin.PointSize.f32{{.*}}
-; SHADERTEST: call [3 x float] @lgc.input.import.builtin.ClipDistance.a3f32{{.*}}
 ; SHADERTEST: call [3 x float] @lgc.input.import.builtin.CullDistance.a3f32{{.*}}
-; SHADERTEST: call <4 x float> @lgc.input.import.builtin.Position.v4f32{{.*}}
-; SHADERTEST: call float @lgc.input.import.builtin.PointSize.f32{{.*}}
-; SHADERTEST: call [3 x float] @lgc.input.import.builtin.ClipDistance.a3f32{{.*}}
 ; SHADERTEST: call [3 x float] @lgc.input.import.builtin.CullDistance.a3f32{{.*}}
-; SHADERTEST: call <4 x float> @lgc.input.import.builtin.Position.v4f32{{.*}}
-; SHADERTEST: call float @lgc.input.import.builtin.PointSize.f32{{.*}}
-; SHADERTEST: call [3 x float] @lgc.input.import.builtin.ClipDistance.a3f32{{.*}}
 ; SHADERTEST: call [3 x float] @lgc.input.import.builtin.CullDistance.a3f32{{.*}}
+; SHADERTEST: call [3 x float] @lgc.input.import.builtin.ClipDistance.a3f32{{.*}}
+; SHADERTEST: call [3 x float] @lgc.input.import.builtin.ClipDistance.a3f32{{.*}}
+; SHADERTEST: call [3 x float] @lgc.input.import.builtin.ClipDistance.a3f32{{.*}}
+; SHADERTEST: call float @lgc.input.import.builtin.PointSize.f32{{.*}}
+; SHADERTEST: call float @lgc.input.import.builtin.PointSize.f32{{.*}}
+; SHADERTEST: call float @lgc.input.import.builtin.PointSize.f32{{.*}}
+; SHADERTEST: call <4 x float> @lgc.input.import.builtin.Position.v4f32{{.*}}
+; SHADERTEST: call <4 x float> @lgc.input.import.builtin.Position.v4f32{{.*}}
+; SHADERTEST: call <4 x float> @lgc.input.import.builtin.Position.v4f32{{.*}}
+; SHADERTEST: call i32 @lgc.input.import.builtin.InvocationId{{.*}}
+; SHADERTEST: call i32 @lgc.input.import.builtin.PrimitiveId{{.*}}
 ; SHADERTEST: AMDLLPC SUCCESS
 */
-// END_SHADERTEST
+// END_SHADERTEST
\ No newline at end of file
diff --git a/llpc/test/shaderdb/object/ObjNonUniform_TestImageSample.frag b/llpc/test/shaderdb/object/ObjNonUniform_TestImageSample.frag
index b288e23122..09afc6ec30 100644
--- a/llpc/test/shaderdb/object/ObjNonUniform_TestImageSample.frag
+++ b/llpc/test/shaderdb/object/ObjNonUniform_TestImageSample.frag
@@ -34,11 +34,10 @@ void main()
 ; SHADERTEST: call {{.*}} @lgc.create.image.sample.v4f32(i32 1, i32 896,
 ; SHADERTEST: call {{.*}} @lgc.create.image.sample.v4f32(i32 1, i32 536,
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results
-; SHADERTEST-COUNT-12: call i32 @llvm.amdgcn.readfirstlane
+; SHADERTEST: call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32
+; SHADERTEST: call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
 ; SHADERTEST: {{%[0-9]*}} = call float @llvm.amdgcn.interp.mov
-; SHADERTEST: {{%[0-9]*}} = bitcast float {{%[0-9]*}} to i32
-; SHADERTEST: {{%[0-9]*}} = call i32 @llvm.amdgcn.readfirstlane{{(.i32)?}}(i32 {{%[0-9]*}})
 ; SHADERTEST: AMDLLPC SUCCESS
 */
 // END_SHADERTEST
diff --git a/llpc/test/shaderdb/object/ObjSampler_TestSeparateSamplerShadow_lit.frag b/llpc/test/shaderdb/object/ObjSampler_TestSeparateSamplerShadow_lit.frag
index 92b381dded..ec7abe4d3c 100644
--- a/llpc/test/shaderdb/object/ObjSampler_TestSeparateSamplerShadow_lit.frag
+++ b/llpc/test/shaderdb/object/ObjSampler_TestSeparateSamplerShadow_lit.frag
@@ -17,7 +17,7 @@ void main()
 ; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) @lgc.create.image.sample.f32(i32 1, i32 512, ptr addrspace(4)
 
 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.sample.c.lz.2d.f32.f16(i32 1, float 0.000000e+00, half 0xH0000, half 0xH0000, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
+; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.sample.c.lz.2d.f32.f16{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 1, float 0.000000e+00, half 0xH0000, half 0xH0000, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0)
 
 ; SHADERTEST: AMDLLPC SUCCESS
 */
diff --git a/llpc/test/shaderdb/object/ObjStorageBlock_TestRowMajor_lit.frag b/llpc/test/shaderdb/object/ObjStorageBlock_TestRowMajor_lit.frag
index 5c24d48069..7c808639de 100644
--- a/llpc/test/shaderdb/object/ObjStorageBlock_TestRowMajor_lit.frag
+++ b/llpc/test/shaderdb/object/ObjStorageBlock_TestRowMajor_lit.frag
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py
 #version 450
 
 layout(std430, row_major, set = 0, binding = 0) buffer BufferObject
@@ -15,14 +16,45 @@ void main()
 
 // BEGIN_SHADERTEST
 /*
-; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s
+; RUN: amdllpc -v -gfxip=11 %s | FileCheck -check-prefix=SHADERTEST %s
 
-; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
-; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1065353216, <4 x i32> {{%[^,]+}}, i32 0, i32 0, i32 0)
-; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1065353216, <4 x i32> {{%[^,]+}}, i32 16, i32 0, i32 0)
-; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1065353216, <4 x i32> {{%[^,]+}}, i32 32, i32 0, i32 0)
-; SHADERTEST: call void @llvm.amdgcn.raw.buffer.store.i32(i32 1065353216, <4 x i32> {{%[^,]+}}, i32 48, i32 0, i32 0)
-
-; SHADERTEST: AMDLLPC SUCCESS
+; SHADERTEST-LABEL: {{^}}// LLPC SPIRV-to-LLVM translation results
 */
 // END_SHADERTEST
+// SHADERTEST-LABEL: @lgc.shader.FS.main(
+// SHADERTEST-NEXT:  .entry:
+// SHADERTEST-NEXT:    [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2)
+// SHADERTEST-NEXT:    store float 1.000000e+00, ptr addrspace(7) [[TMP0]], align 4
+// SHADERTEST-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 16
+// SHADERTEST-NEXT:    store float 1.000000e+00, ptr addrspace(7) [[TMP1]], align 4
+// SHADERTEST-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 32
+// SHADERTEST-NEXT:    store float 1.000000e+00, ptr addrspace(7) [[TMP2]], align 4
+// SHADERTEST-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 48
+// SHADERTEST-NEXT:    store float 1.000000e+00, ptr addrspace(7) [[TMP3]], align 4
+// SHADERTEST-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(7) [[TMP0]], align 16
+// SHADERTEST-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> poison, float [[TMP4]], i64 0
+// SHADERTEST-NEXT:    [[TMP6:%.*]] = load float, ptr addrspace(7) [[TMP1]], align 16
+// SHADERTEST-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP6]], i64 1
+// SHADERTEST-NEXT:    [[TMP8:%.*]] = load float, ptr addrspace(7) [[TMP2]], align 16
+// SHADERTEST-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP8]], i64 2
+// SHADERTEST-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float 1.000000e+00, i64 3
+// SHADERTEST-NEXT:    call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP10]]) #[[ATTR2:[0-9]+]]
+// SHADERTEST-NEXT:    ret void
+//
+//
+// SHADERTEST-LABEL: @_amdgpu_ps_main(
+// SHADERTEST-NEXT:  .entry:
+// SHADERTEST-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc()
+// SHADERTEST-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], -4294967296
+// SHADERTEST-NEXT:    [[TMP2:%.*]] = zext i32 [[USERDATA0:%.*]] to i64
+// SHADERTEST-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP1]], [[TMP2]]
+// SHADERTEST-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(4)
+// SHADERTEST-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP4]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP4]], i32 -1) ]
+// SHADERTEST-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP4]], align 16
+// SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.buffer.store.i32(i32 1065353216, <4 x i32> [[TMP5]], i32 0, i32 0, i32 0)
+// SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.buffer.store.i32(i32 1065353216, <4 x i32> [[TMP5]], i32 16, i32 0, i32 0)
+// SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.buffer.store.i32(i32 1065353216, <4 x i32> [[TMP5]], i32 32, i32 0, i32 0)
+// SHADERTEST-NEXT:    call void @llvm.amdgcn.raw.buffer.store.i32(i32 1065353216, <4 x i32> [[TMP5]], i32 48, i32 0, i32 0)
+// SHADERTEST-NEXT:    call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i1 true, i1 true)
+// SHADERTEST-NEXT:    ret void
+//
diff --git a/llpc/test/shaderdb/object/ObjStorageBlock_TestRuntimeArray_lit.vert b/llpc/test/shaderdb/object/ObjStorageBlock_TestRuntimeArray_lit.vert
index 45c23365de..4776e6fad3 100644
--- a/llpc/test/shaderdb/object/ObjStorageBlock_TestRuntimeArray_lit.vert
+++ b/llpc/test/shaderdb/object/ObjStorageBlock_TestRuntimeArray_lit.vert
@@ -16,7 +16,7 @@ void main()
 /*
 ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results
-; SHADERTEST: getelementptr <{ [4 x float], [4294967295 x [4 x float]] }>, ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 1, i32 %{{[0-9]*}}
+; SHADERTEST: getelementptr (<{ [4 x float], [4294967295 x [4 x float]] }>, ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 1)
 
 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results
 ; SHADERTEST: store <4 x float>
diff --git a/llpc/test/shaderdb/ray_tracing/PipelineRayquery.pipe b/llpc/test/shaderdb/ray_tracing/PipelineRayquery.pipe
index d1e1a03404..752d15aa95 100644
--- a/llpc/test/shaderdb/ray_tracing/PipelineRayquery.pipe
+++ b/llpc/test/shaderdb/ray_tracing/PipelineRayquery.pipe
@@ -258,12 +258,12 @@ rtState.rtIpOverride = 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[__LLPC_INPUT_PROXY_GL_GLOBALINVOCATIONID_4_VEC_EXTRACT]]
 ; CHECK-NEXT:    [[__LLPC_INPUT_PROXY_GL_GLOBALINVOCATIONID_0_VEC_EXTRACT:%.*]] = extractelement <3 x i32> [[TMP5]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP8]], [[__LLPC_INPUT_PROXY_GL_GLOBALINVOCATIONID_0_VEC_EXTRACT]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr <{ [4294967295 x <{ [3 x float], float, [3 x float], float }>] }>, ptr addrspace(7) [[TMP3]], i32 0, i32 0, i32 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP10]], align 16
+; CHECK-NEXT:    [[TMP10:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP3]], i32 32, i32 [[TMP9]])
+; CHECK-NEXT:    [[TMP11:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP10]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr addrspace(7) [[TMP10]], i32 12
 ; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr addrspace(7) [[TMP12]], align 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr addrspace(7) [[TMP10]], i32 16
-; CHECK-NEXT:    [[TMP15:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP14]], align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP14]], align 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr addrspace(7) [[TMP10]], i32 28
 ; CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr addrspace(7) [[TMP16]], align 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = call ptr addrspace(5) (...) @lgc.rtq.gep.opaque([3 x i127] poison, i1 false, ptr addrspace(5) [[__LLPC_GLOBAL_PROXY_Q]], i32 0, i32 2)
@@ -287,7 +287,7 @@ rtState.rtIpOverride = 0
 ; CHECK-NEXT:    br i1 [[COND]], label [[TMP29:%.*]], label [[COMMON_RET]]
 ; CHECK:       29:
 ; CHECK-NEXT:    [[TMP30:%.*]] = call reassoc nnan nsz arcp contract afn <2 x float> (...) @lgc.rtq.intersection.barycentrics(ptr addrspace(5) [[TMP27]], i1 true)
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr <{ [4294967295 x [2 x float]] }>, ptr addrspace(7) [[TMP0]], i32 0, i32 0, i32 [[TMP9]]
-; CHECK-NEXT:    store <2 x float> [[TMP30]], ptr addrspace(7) [[TMP31]], align 8
+; CHECK-NEXT:    [[TMP31:%.*]] = call ptr addrspace(7) @lgc.buffer.index(ptr addrspace(7) [[TMP0]], i32 8, i32 [[TMP9]])
+; CHECK-NEXT:    store <2 x float> [[TMP30]], ptr addrspace(7) [[TMP31]], align 4
 ; CHECK-NEXT:    br label [[COMMON_RET]]
 ;
diff --git a/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_SpecializeDriverShaders.pipe b/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_SpecializeDriverShaders.pipe
new file mode 100644
index 0000000000..cc50265c6d
--- /dev/null
+++ b/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_SpecializeDriverShaders.pipe
@@ -0,0 +1,285 @@
+; Check that driver shader specialization (SDS) works on LLPC raytracing pipelines.
+;
+; This test consists of two files:
+;  * ./PipelineRays_Continuations_SpecializeDriverShaders.pipe
+;    This file defines the actual pipeline, and uses debug output to check analysis details.
+;    Thus, it only runs with assertions enabled.
+;  * ./PipelineRays_Continuations_SpecializeDriverShaders_Isa.pipe
+;    This file uses the above pipeline, and checks Traversal ISA.
+;    It does not rely on debug builds.
+;
+; Assertions are required because we check debug output:
+; REQUIRES: assertions
+;
+; RUN: amdllpc -gfxip 11.0 -emit-llvm -o - %s --debug-only='specialize-driver-shaders' 2>&1 | FileCheck %s
+
+[Version]
+version = 69
+
+[rgenGlsl]
+#version 460
+#extension GL_EXT_ray_tracing : require
+
+struct RayPayload {
+  // The different fields test different cases of where values are set in which way.
+  // The yes/no comment indicates whether it should be constant-propagated into Traversal.
+  // For common constants of value$i, we use 0xbeef$i.
+  uint value1; // yes: same constant among all TraceRay sites and AHS (0xbeef1)
+  uint value2; // no:  different between TraceRay sites, ignored by AHS
+  uint value3; // yes: unset in TraceRay call sites, set by AHS
+  uint value4; // yes: unset in TraceRay call sites, conditionally set by AHS
+  uint value5; // no:  same constant in TraceRay call sites, modified by AHS
+  uint value6; // no:  constant in RayGen, dynamic in CHit, ignored by AHS
+  uint value7; // no:  dynamic RayGen, constant in CHit, ignored by AHS
+};
+
+layout(binding = 0, set = 0) uniform accelerationStructureEXT g_bvh;
+layout(binding = 1, set = 0, rgba32f) uniform image2D g_dst;
+
+layout(location = 14) rayPayloadEXT RayPayload g_ray;
+
+// CHECK-LABEL: [SDS] Finished analysis of function _rgen_1
+// CHECK-NEXT: [SDS]
+// Capture the column headers of the arg slot table in the next line.
+// When analyzing the traversal shader, check that the number of arg slots matches.
+// This should be the case because the payload should use the top arg slots in both cases.
+// Verifying this means it is likely that we succeeded in aligning arguments.
+// Don't require a specific number because it depends on GpuRt system data.
+// Example line with 42 arg slots:
+//     [SDS] 012345678901234567890123456789012345678901
+// CHECK-NEXT: [SDS] [[ARG_SLOTS:[0-9]+]]{{$}}
+// Check that the payload is detected as expected:
+// CHECK-NEXT: [SDS] {{[CUDP]+}}CCUUCCD{{$}}
+
+void main() {
+  vec3 origin;
+  origin.x = gl_LaunchIDEXT.x;
+  origin.y = gl_LaunchIDEXT.y;
+  origin.z = 0;
+  g_ray.value1 = 0xbeef1;
+  g_ray.value2 = 0x1beef2;
+  // g_ray.value3 = unset;
+  // g_ray.value4 = unset
+  g_ray.value5 = 0xbeef5;
+  g_ray.value6 = 0xbeef6;
+  g_ray.value7 = gl_LaunchIDEXT.x;
+
+  float tMin = intBitsToFloat(0xdeadbeef);
+
+  traceRayEXT(g_bvh, /* ray flags */ 0, /* cull mask */ 0xff,
+              /* sbt offset */ 0, /* sbt stride */ 1, /* miss index */ 0,
+              origin.xyz, tMin, /* direction */ vec3(1, 0, 0),
+              /* tmax */ 48.0, /* payload location */ 14);
+
+  imageStore(g_dst, ivec2(gl_LaunchIDEXT.xy), vec4(0, 0, 0, 0));
+}
+
+[rgenInfo]
+entryPoint = main
+
+[chitGlsl]
+#version 460
+#extension GL_EXT_ray_tracing : require
+
+struct RayPayload {
+  // The different fields test different cases of where values are set in which way.
+  // The yes/no comment indicates whether it should be constant-propagated into Traversal.
+  // For common constants of value$i, we use 0xbeef$i.
+  uint value1; // yes: same constant among all TraceRay sites and AHS (0xbeef1)
+  uint value2; // no:  different between TraceRay sites, ignored by AHS
+  uint value3; // yes: unset in TraceRay call sites, set by AHS
+  uint value4; // yes: unset in TraceRay call sites, conditionally set by AHS
+  uint value5; // no:  same constant in TraceRay call sites, modified by AHS
+  uint value6; // no:  constant in RayGen, dynamic in CHit, ignored by AHS
+  uint value7; // no:  dynamic RayGen, constant in CHit, ignored by AHS
+};
+
+layout(binding = 0, set = 0) uniform accelerationStructureEXT g_bvh;
+layout(shaderRecordEXT, std430) buffer sbt {
+  float z;
+};
+layout(location = 14) rayPayloadEXT RayPayload g_ray;
+rayPayloadInEXT RayPayload g_ray_incoming;
+
+hitAttributeEXT vec2 g_hit;
+
+// CHECK-LABEL: [SDS] Finished analysis of function _chit_2
+// CHECK-NEXT: [SDS]
+// CHECK-NEXT: [SDS] [[ARG_SLOTS]]{{$}}
+// Check that the payload is detected as expected:
+// CHECK-NEXT: [SDS] {{[CUDP]+}}CCUUCDC{{$}}
+
+void main() {
+  g_ray.value1 = 0xbeef1;
+  g_ray.value2 = 0x2beef2; // rgs sets 0x1beef2
+  // g_ray.value3 = unset;
+  // g_ray.value4 = unset
+  g_ray.value5 = 0xbeef5;
+  g_ray.value6 = g_ray_incoming.value1; // counts as dynamic
+  g_ray.value7 = 0xbeef7;
+
+  vec3 origin;
+  origin.x = gl_LaunchIDEXT.x;
+  origin.y = gl_LaunchIDEXT.y;
+  origin.z = 0;
+
+  float tMin = intBitsToFloat(0xdeadbeef);
+
+  traceRayEXT(g_bvh, /* ray flags */ 0, /* cull mask */ 0xff,
+              /* sbt offset */ 0, /* sbt stride */ 1, /* miss index */ 0,
+              origin.xyz, tMin, /* direction */ vec3(1, 0, 0),
+              /* tmax */ 48.0, /* payload location */ 14);
+}
+
+[chitInfo]
+entryPoint = main
+
+[ahitGlsl]
+#version 460
+#extension GL_EXT_ray_tracing : require
+
+struct RayPayload {
+  // The different fields test different cases of where values are set in which way.
+  // The yes/no comment indicates whether it should be constant-propagated into Traversal.
+  // For common constants of value$i, we use 0xbeef$i.
+  uint value1; // yes: same constant among all TraceRay sites and AHS (0xbeef1)
+  uint value2; // no:  different between TraceRay sites, ignored by AHS
+  uint value3; // yes: unset in TraceRay call sites, set by AHS
+  uint value4; // yes: unset in TraceRay call sites, conditionally set by AHS
+  uint value5; // no:  same constant in TraceRay call sites, modified by AHS
+  uint value6; // no:  constant in RayGen, dynamic in CHit, ignored by AHS
+  uint value7; // no:  dynamic RayGen, constant in CHit, ignored by AHS
+};
+
+layout(shaderRecordEXT, std430) buffer sbt {
+  float z;
+};
+//layout(location = 14) rayPayloadEXT RayPayload g_ray;
+rayPayloadInEXT RayPayload g_ray;
+
+hitAttributeEXT vec2 g_hit;
+
+// CHECK-LABEL: [SDS] Finished analysis of function _ahit_3
+// CHECK-NEXT: [SDS]
+// CHECK-NEXT: [SDS] [[ARG_SLOTS]]{{$}}
+// Check that the payload is detected as expected:
+// CHECK-NEXT: [SDS] {{[CUDP]+}}CPCCDPP{{$}}
+
+void main() {
+  g_ray.value1 = 0xbeef1;
+  //g_ray.value2 = unset;
+  g_ray.value3 = 0xbeef3;
+  if (g_ray.value4 == 17) {
+    g_ray.value4 = 0xbeef4;
+  }
+  g_ray.value5 += 1;
+  // g_ray.value6 = unset;
+  // g_ray.value7 = unset;
+
+  // Ensure we are indeed an AHS:
+  ignoreIntersectionEXT;
+}
+
+[ahitInfo]
+entryPoint = main
+
+[sectGlsl]
+#version 460
+#extension GL_EXT_ray_tracing : enable
+
+// CHECK-LABEL: [SDS] Finished analysis of function _sect_4
+// CHECK-NEXT: [SDS]
+// For intersection, do not force the line to end with ARG_SLOTS, as intersection pessimistically preserves
+// payload VGPRs, and thus may see a larger number of args:
+// CHECK-NEXT: [SDS] [[ARG_SLOTS]]
+// Check that intersection pessimistically preserves 32 payload VGPRs:
+// CHECK-NEXT: [SDS] {{[CUDP]+}}PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}}
+void main()
+{
+    reportIntersectionEXT(0.5, 0u);
+}
+
+[sectInfo]
+entryPoint = main
+
+; Check Traversal analysis
+; CHECK-LABEL: [SDS] Finished analysis of function _cs_
+; CHECK-NEXT: [SDS]
+; CHECK-NEXT: [SDS] [[ARG_SLOTS]]{{$}}
+; Check that at least 7 dwords at the end are preserved for the payload:
+; CHECK-NEXT: [SDS] {{[CUDP]+}}PPPPPPP{{$}}
+
+[ResourceMapping]
+userDataNode[0].visibility = 0xffffffff
+userDataNode[0].type = DescriptorTableVaPtr
+userDataNode[0].offsetInDwords = 0
+userDataNode[0].sizeInDwords = 1
+userDataNode[0].next[0].type = DescriptorConstBuffer
+userDataNode[0].next[0].offsetInDwords = 0
+userDataNode[0].next[0].sizeInDwords = 4
+userDataNode[0].next[0].set = 0x00000000
+userDataNode[0].next[0].binding = 0
+userDataNode[0].next[1].type = DescriptorImage
+userDataNode[0].next[1].offsetInDwords = 4
+userDataNode[0].next[1].sizeInDwords = 8
+userDataNode[0].next[1].set = 0x00000000
+userDataNode[0].next[1].binding = 1
+userDataNode[1].visibility = 0xffffffff
+userDataNode[1].type = DescriptorTableVaPtr
+userDataNode[1].offsetInDwords = 1
+userDataNode[1].sizeInDwords = 1
+userDataNode[1].next[0].type = DescriptorConstBufferCompact
+userDataNode[1].next[0].offsetInDwords = 0
+userDataNode[1].next[0].sizeInDwords = 2
+userDataNode[1].next[0].set = 0x0000005D
+userDataNode[1].next[0].binding = 17
+userDataNode[1].next[1].type = DescriptorConstBuffer
+userDataNode[1].next[1].offsetInDwords = 2
+userDataNode[1].next[1].sizeInDwords = 4
+userDataNode[1].next[1].set = 0x0000005D
+userDataNode[1].next[1].binding = 0
+userDataNode[1].next[2].type = DescriptorBuffer
+userDataNode[1].next[2].offsetInDwords = 6
+userDataNode[1].next[2].sizeInDwords = 4
+userDataNode[1].next[2].set = 0x0000005D
+userDataNode[1].next[2].binding = 1
+
+[RayTracingPipelineState]
+groups[0].type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR
+groups[0].generalShader = 0
+groups[0].closestHitShader = -1
+groups[0].anyHitShader = -1
+groups[0].intersectionShader = -1
+groups[1].type = VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR
+groups[1].closestHitShader = 1
+groups[1].anyHitShader = 2
+groups[2].type = VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR
+groups[2].generalShader = -1
+groups[2].closestHitShader = -1
+groups[2].anyHitShader = -1
+groups[2].intersectionShader = 3
+maxRecursionDepth = 2
+indirectStageMask = 0xffffffff
+mode = 3
+rtState.bvhResDescSize = 4
+rtState.bvhResDesc[0] = 0
+rtState.bvhResDesc[1] = 2197815296
+rtState.bvhResDesc[2] = 4294967295
+rtState.bvhResDesc[3] = 2164261887
+rtState.nodeStrideShift = 7
+rtState.threadGroupSizeX = 8
+rtState.threadGroupSizeY = 4
+rtState.threadGroupSizeZ = 1
+rtState.rayQueryCsSwizzle = 1
+rtState.ldsStackSize = 16
+rtState.dispatchRaysThreadGroupSize = 32
+rtState.ldsSizePerThreadGroup = 65536
+rtState.outerTileSize = 4
+rtState.dispatchDimSwizzleMode = 0
+rtState.enableDispatchRaysInnerSwizzle = 1
+rtState.enableDispatchRaysOuterSwizzle = 1
+rtState.enableOptimalLdsStackSizeForIndirect = 1
+rtState.enableOptimalLdsStackSizeForUnified = 1
+payloadSizeMaxInLib = 28
+attributeSizeMaxInLib = 8
+hasPipelineLibrary = 1
diff --git a/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_SpecializeDriverShaders_Isa.pipe b/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_SpecializeDriverShaders_Isa.pipe
new file mode 100644
index 0000000000..c3957b33e0
--- /dev/null
+++ b/llpc/test/shaderdb/ray_tracing/PipelineRays_Continuations_SpecializeDriverShaders_Isa.pipe
@@ -0,0 +1,40 @@
+; Check that driver shader specialization (SDS) works on LLPC raytracing pipelines.
+;
+; This test consists of two files:
+;  * ./PipelineRays_Continuations_SpecializeDriverShaders.pipe
+;    This file defines the actual pipeline, and uses debug output to check analysis details.
+;    Thus, it only runs with assertions enabled.
+;  * ./PipelineRays_Continuations_SpecializeDriverShaders_Isa.pipe
+;    This file uses the above pipeline, and checks Traversal ISA.
+;    It does not rely on debug builds.
+;
+; RUN: amdllpc -gfxip 11.0 -filetype=asm -o - %S/PipelineRays_Continuations_SpecializeDriverShaders.pipe | FileCheck %s
+
+; Check Traversal ISA
+; Due to the way FileCheck works, and because we don't know in which order the backend will make use of the constants,
+; we have to check for the constants using DAG directives.
+; We'd also like to check that no unexpected constants show up, but adding NOT directives in between DAG directives would
+; impose an ordering on the DAG ones, so we only check that there are no unexpected constants before the first expected
+; one and after the last expected one.
+;
+; CHECK-LABEL: .type	_cs_,@function
+; CHECK-NEXT: _cs_:
+;
+; CHECK-NOT: 0x{{0*}}beef2
+; CHECK-NOT: 0x{{0*}}beef5
+; CHECK-NOT: 0x{{0*}}beef6
+; CHECK-NOT: 0x{{0*}}beef7
+;
+; CHECK-DAG: 0x{{0*}}beef1
+; CHECK-DAG: 0x{{0*}}beef3
+; CHECK-DAG: 0x{{0*}}beef4
+; Common tMin value:
+; CHECK-DAG: 0x{{0*}}deadbeef
+;
+; CHECK-NOT: 0x{{0*}}beef2
+; CHECK-NOT: 0x{{0*}}beef5
+; CHECK-NOT: 0x{{0*}}beef6
+; CHECK-NOT: 0x{{0*}}beef7
+;
+; Ensure the above only applies to Traversal ISA:
+; CHECK-LABEL: .Lfunc_end{{.*}}:
diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe
index a0dde93d4d..5bdbe31d34 100644
--- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe
+++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe
@@ -105,7 +105,7 @@ attribute[0].offset = 0
 ; SHADERTEST-NEXT:    [[TMP18:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP9]], align 4, !invariant.load !11
 ; SHADERTEST-NEXT:    [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i64 0
 ; SHADERTEST-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i64 1
-; SHADERTEST-NEXT:    [[TMP21:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[TMP19]], float [[TMP20]], <8 x i32> [[TMP17]], <4 x i32> [[TMP18]], i1 false, i32 0, i32 0)
+; SHADERTEST-NEXT:    [[TMP21:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32{{(\.v8i32)?}}{{(\.v4i32)?}}(i32 15, float [[TMP19]], float [[TMP20]], <8 x i32> [[TMP17]], <4 x i32> [[TMP18]], i1 false, i32 0, i32 0)
 ; SHADERTEST-NEXT:    call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP21]]) #[[ATTR5:[0-9]+]]
 ; SHADERTEST-NEXT:    ret void
 ;
diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_MultiDwordPushConst.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_MultiDwordPushConst.pipe
index 597d9d0166..caadbe8315 100644
--- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_MultiDwordPushConst.pipe
+++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_MultiDwordPushConst.pipe
@@ -9,7 +9,7 @@
 ; SHADERTEST: define dllexport amdgpu_ps { <4 x float>, i32 } @_amdgpu_ps_main({{.*}}, i32 inreg noundef %userdata3, i32 inreg noundef %userdata4, {{.*}})
 ; SHADERTEST: [[pushConst0:%[.a-zA-Z0-9]+]] = bitcast i32 %userdata3 to float
 ; SHADERTEST: [[pushConst1:%[.a-zA-Z0-9]+]] = bitcast i32 %userdata4 to float
-; SHADERTEST: @llvm.amdgcn.image.gather4.lz.2d.sl_v4f32i32s.f32({{.*}}, float [[pushConst0]], float [[pushConst1]], {{.*}})
+; SHADERTEST: @llvm.amdgcn.image.gather4.lz.2d.sl_v4f32i32s.f32{{(\.v8i32)?}}{{(\.v4i32)?}}({{.*}}, float [[pushConst0]], float [[pushConst1]], {{.*}})
 
 ; Check that those parameters are passed in as s2 and s3.
 ; SHADERTEST-LABEL: _amdgpu_ps_main:
diff --git a/llpc/tool/amdllpc.cpp b/llpc/tool/amdllpc.cpp
index b80cfa591c..0bf253bbac 100644
--- a/llpc/tool/amdllpc.cpp
+++ b/llpc/tool/amdllpc.cpp
@@ -235,7 +235,7 @@ cl::opt<bool> EnableScratchAccessBoundsChecks("enable-scratch-bounds-checks",
 // -enable-implicit-invariant-exports: allow implicit marking of position exports as invariant
 cl::opt<bool> EnableImplicitInvariantExports("enable-implicit-invariant-exports",
                                               cl::desc("Enable implicit marking of position exports as invariant"),
-                                              cl::init(true));
+                                              cl::init(false));
 
 // -enable-forceCsThreadIdSwizzling: force cs thread id swizzling
 cl::opt<bool> ForceCsThreadIdSwizzling("force-compute-shader-thread-id-swizzling",
@@ -526,6 +526,7 @@ static Result init(int argc, char *argv[], ICompiler *&compiler, ShaderCacheWrap
   // Create internal cache
   cache = ShaderCacheWrap::Create(argc, argv);
 
+  strcpy(argv[0], VkCompilerName); // The first argument is the client, modify it to Vulkan standalone compiler name
   Result result = ICompiler::Create(ParsedGfxIp, argc, argv, &compiler, cache);
   if (result != Result::Success)
     return result;
@@ -556,11 +557,16 @@ static Result init(int argc, char *argv[], ICompiler *&compiler, ShaderCacheWrap
 // @param [out] compileInfo : Compilation info of LLPC standalone tool
 static void initCompileInfo(CompileInfo *compileInfo) {
   compileInfo->gfxIp = ParsedGfxIp;
-  compileInfo->relocatableShaderElf = EnableRelocatableShaderElf;
-  compileInfo->robustBufferAccess = RobustBufferAccess;
-  compileInfo->scalarBlockLayout = ScalarBlockLayout;
-  compileInfo->scratchAccessBoundsChecks = EnableScratchAccessBoundsChecks;
-  compileInfo->enableImplicitInvariantExports = EnableImplicitInvariantExports;
+  if (EnableRelocatableShaderElf.getNumOccurrences())
+    compileInfo->relocatableShaderElf = EnableRelocatableShaderElf;
+  if (RobustBufferAccess.getNumOccurrences())
+    compileInfo->robustBufferAccess = RobustBufferAccess;
+  if (ScalarBlockLayout.getNumOccurrences())
+    compileInfo->scalarBlockLayout = ScalarBlockLayout;
+  if (EnableScratchAccessBoundsChecks.getNumOccurrences())
+    compileInfo->scratchAccessBoundsChecks = EnableScratchAccessBoundsChecks;
+  if (EnableImplicitInvariantExports.getNumOccurrences())
+    compileInfo->enableImplicitInvariantExports = EnableImplicitInvariantExports;
   compileInfo->bvhNodeStride = BvhNodeStride;
   compileInfo->enableColorExportShader = EnableColorExportShader;
 
@@ -760,6 +766,9 @@ static Error processInputs(ICompiler *compiler, InputSpecGroup &inputSpecs, bool
       compileInfo.pipelineType = VfxPipelineTypeCompute;
     } else {
       compileInfo.pipelineType = VfxPipelineTypeGraphics;
+
+      auto &info = compileInfo.gfxPipelineInfo;
+      info.iaState.patchControlPoints = 3;
     }
   }
 
diff --git a/llpc/tool/llpcCompilationUtils.cpp b/llpc/tool/llpcCompilationUtils.cpp
index 06c49d5422..d1bedd8817 100644
--- a/llpc/tool/llpcCompilationUtils.cpp
+++ b/llpc/tool/llpcCompilationUtils.cpp
@@ -96,12 +96,13 @@ namespace StandaloneCompiler {
 // @returns : Pointer to the allocated memory
 void *VKAPI_CALL allocateBuffer(void *instance, void *userData, size_t size) {
   (void)instance;
-  void *allocBuf = malloc(size);
+  auto allocOwner = std::make_unique<char[]>(size);
+  void *allocBuf = allocOwner.get();
   memset(allocBuf, 0, size);
 
   assert(userData);
-  auto *outBuf = reinterpret_cast<void **>(userData);
-  *outBuf = allocBuf;
+  auto *info = reinterpret_cast<CompileInfo *>(userData);
+  info->pipelineBufs.push_back(std::move(allocOwner));
   return allocBuf;
 }
 
@@ -115,11 +116,9 @@ void cleanupCompileInfo(CompileInfo *compileInfo) {
     // It will be freed when we close the VFX doc.
     if (!compileInfo->pipelineInfoFile)
       delete[] reinterpret_cast<const char *>(compileInfo->shaderModuleDatas[i].spirvBin.pCode);
-
-    free(compileInfo->shaderModuleDatas[i].shaderBuf);
   }
 
-  free(compileInfo->pipelineBuf);
+  compileInfo->pipelineBufs.clear();
 
   if (compileInfo->pipelineInfoFile)
     Vfx::vfxCloseDoc(compileInfo->pipelineInfoFile);
@@ -348,7 +347,7 @@ Error buildShaderModules(ICompiler *compiler, CompileInfo *compileInfo) {
     ShaderModuleBuildOut *shaderOut = &shaderModuleData.shaderOut;
 
     shaderInfo->pInstance = nullptr; // Placeholder, unused.
-    shaderInfo->pUserData = &shaderModuleData.shaderBuf;
+    shaderInfo->pUserData = compileInfo;
     shaderInfo->pfnOutputAlloc = allocateBuffer;
     shaderInfo->shaderBin = shaderModuleData.spirvBin;
 
diff --git a/llpc/tool/llpcCompilationUtils.h b/llpc/tool/llpcCompilationUtils.h
index e6afe5c463..d7ea7ed403 100644
--- a/llpc/tool/llpcCompilationUtils.h
+++ b/llpc/tool/llpcCompilationUtils.h
@@ -67,6 +67,8 @@
 namespace Llpc {
 namespace StandaloneCompiler {
 
+using Vkgc::optional_bool;
+
 // Represents the module info for a shader module.
 struct ShaderModuleData {
   Llpc::ShaderStage shaderStage;          // Shader stage
@@ -74,7 +76,6 @@ struct ShaderModuleData {
   Llpc::BinaryData spirvBin;              // SPIR-V binary codes
   Llpc::ShaderModuleBuildInfo shaderInfo; // Info to build shader modules
   Llpc::ShaderModuleBuildOut shaderOut;   // Output of building shader modules
-  void *shaderBuf;                        // Allocation buffer of building shader modules
   bool disableDoAutoLayout;               // Indicates whether to disable auto layout of descriptors
 };
 
@@ -88,23 +89,23 @@ struct CompileInfo {
   std::vector<uint32_t> fsOutputs;                                           // Fragment outputs
   llvm::SmallVector<StandaloneCompiler::ShaderModuleData> shaderModuleDatas; // ShaderModule Data
   Llpc::GraphicsPipelineBuildInfo gfxPipelineInfo;                           // Info to build graphics pipeline
-  Llpc::GraphicsPipelineBuildOut gfxPipelineOut;                             // Output of building graphics pipeline
+  llvm::SmallVector<Llpc::GraphicsPipelineBuildOut> gfxPipelineOut;          // Output of building graphics pipeline
   Llpc::ComputePipelineBuildInfo compPipelineInfo;                           // Info to build compute pipeline
   Llpc::ComputePipelineBuildOut compPipelineOut;                             // Output of building compute pipeline
   RayTracingPipelineBuildInfo rayTracePipelineInfo;                          // Info to build ray tracing pipeline
   RayTracingPipelineBuildOut rayTracingPipelineOut;                          // Output of building ray tracing pipeline
   unsigned bvhNodeStride;
-  void *pipelineBuf;                   // Allocation buffer of building pipeline
-  void *pipelineInfoFile;              // VFX-style file containing pipeline info
-  bool unlinked;                       // Whether to generate unlinked shader/part-pipeline ELF
-  bool relocatableShaderElf;           // Whether to enable relocatable shader compilation
-  bool scalarBlockLayout;              // Whether to enable scalar block layout
-  bool doAutoLayout;                   // Whether to auto layout descriptors
-  bool autoLayoutDesc;                 // Whether to automatically create descriptor layout based on resource usages
-  bool robustBufferAccess;             // Whether to enable robust buffer access
-  bool scratchAccessBoundsChecks;      // Whether to enable scratch access bounds checks
-  bool enableImplicitInvariantExports; // Whether to enable implicit marking of position exports as invariant
-  VfxPipelineType pipelineType;        // Pipeline type
+  llvm::SmallVector<std::unique_ptr<char[]>> pipelineBufs; // Allocation buffers of building pipeline
+  void *pipelineInfoFile;                                  // VFX-style file containing pipeline info
+  bool unlinked;                                           // Whether to generate unlinked shader/part-pipeline ELF
+  optional_bool relocatableShaderElf;                      // Whether to enable relocatable shader compilation
+  optional_bool scalarBlockLayout;                         // Whether to enable scalar block layout
+  bool doAutoLayout;                                       // Whether to auto layout descriptors
+  bool autoLayoutDesc;                     // Whether to automatically create descriptor layout based on resource usages
+  optional_bool robustBufferAccess;        // Whether to enable robust buffer access
+  optional_bool scratchAccessBoundsChecks; // Whether to enable scratch access bounds checks
+  optional_bool enableImplicitInvariantExports; // Whether to enable implicit marking of position exports as invariant
+  VfxPipelineType pipelineType;                 // Pipeline type
 #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768
   // Old version of the code
   std::optional<llvm::CodeGenOpt::Level> optimizationLevel; // The optimization level to pass the compiler
diff --git a/llpc/tool/llpcComputePipelineBuilder.cpp b/llpc/tool/llpcComputePipelineBuilder.cpp
index 120b0cccfd..d5a39ae4f7 100644
--- a/llpc/tool/llpcComputePipelineBuilder.cpp
+++ b/llpc/tool/llpcComputePipelineBuilder.cpp
@@ -126,13 +126,17 @@ Expected<BinaryData> ComputePipelineBuilder::buildComputePipeline() {
   }
 
   pipelineInfo->pInstance = nullptr; // Placeholder, unused.
-  pipelineInfo->pUserData = &compileInfo.pipelineBuf;
+  pipelineInfo->pUserData = &compileInfo;
   pipelineInfo->pfnOutputAlloc = allocateBuffer;
   pipelineInfo->unlinked = compileInfo.unlinked;
-  pipelineInfo->options.robustBufferAccess = compileInfo.robustBufferAccess;
-  pipelineInfo->options.enableRelocatableShaderElf = compileInfo.relocatableShaderElf;
-  pipelineInfo->options.scalarBlockLayout = compileInfo.scalarBlockLayout;
-  pipelineInfo->options.enableScratchAccessBoundsChecks = compileInfo.scratchAccessBoundsChecks;
+  if (compileInfo.robustBufferAccess.has_value())
+    pipelineInfo->options.robustBufferAccess = *compileInfo.robustBufferAccess;
+  if (compileInfo.relocatableShaderElf.has_value())
+    pipelineInfo->options.enableRelocatableShaderElf = *compileInfo.relocatableShaderElf;
+  if (compileInfo.scalarBlockLayout.has_value())
+    pipelineInfo->options.scalarBlockLayout = *compileInfo.scalarBlockLayout;
+  if (compileInfo.scratchAccessBoundsChecks.has_value())
+    pipelineInfo->options.enableScratchAccessBoundsChecks = *compileInfo.scratchAccessBoundsChecks;
   pipelineInfo->options.forceCsThreadIdSwizzling = compileInfo.compPipelineInfo.options.forceCsThreadIdSwizzling;
   pipelineInfo->options.overrideThreadGroupSizeX = compileInfo.compPipelineInfo.options.overrideThreadGroupSizeX;
   pipelineInfo->options.overrideThreadGroupSizeY = compileInfo.compPipelineInfo.options.overrideThreadGroupSizeY;
diff --git a/llpc/tool/llpcGraphicsPipelineBuilder.cpp b/llpc/tool/llpcGraphicsPipelineBuilder.cpp
index 90bf049ec4..f0d8cf724f 100644
--- a/llpc/tool/llpcGraphicsPipelineBuilder.cpp
+++ b/llpc/tool/llpcGraphicsPipelineBuilder.cpp
@@ -101,7 +101,6 @@ Error GraphicsPipelineBuilder::build() {
 Expected<BinaryData> GraphicsPipelineBuilder::buildGraphicsPipeline() {
   CompileInfo &compileInfo = getCompileInfo();
   GraphicsPipelineBuildInfo *pipelineInfo = &compileInfo.gfxPipelineInfo;
-  GraphicsPipelineBuildOut *pipelineOut = &compileInfo.gfxPipelineOut;
 
   // Fill pipeline shader info.
   // clang-format off
@@ -140,19 +139,20 @@ Expected<BinaryData> GraphicsPipelineBuilder::buildGraphicsPipeline() {
                          compileInfo.autoLayoutDesc);
 
   pipelineInfo->pInstance = nullptr; // Placeholder, unused.
-  pipelineInfo->pUserData = &compileInfo.pipelineBuf;
+  pipelineInfo->pUserData = &compileInfo;
   pipelineInfo->pfnOutputAlloc = allocateBuffer;
   pipelineInfo->unlinked = compileInfo.unlinked;
 
-  // NOTE: If number of patch control points is not specified, we set it to 3.
-  if (pipelineInfo->iaState.patchControlPoints == 0)
-    pipelineInfo->iaState.patchControlPoints = 3;
-
-  pipelineInfo->options.robustBufferAccess = compileInfo.robustBufferAccess;
-  pipelineInfo->options.enableRelocatableShaderElf = compileInfo.relocatableShaderElf;
-  pipelineInfo->options.scalarBlockLayout = compileInfo.scalarBlockLayout;
-  pipelineInfo->options.enableScratchAccessBoundsChecks = compileInfo.scratchAccessBoundsChecks;
-  pipelineInfo->options.enableImplicitInvariantExports = compileInfo.enableImplicitInvariantExports;
+  if (compileInfo.robustBufferAccess.has_value())
+    pipelineInfo->options.robustBufferAccess = *compileInfo.robustBufferAccess;
+  if (compileInfo.relocatableShaderElf.has_value())
+    pipelineInfo->options.enableRelocatableShaderElf = *compileInfo.relocatableShaderElf;
+  if (compileInfo.scalarBlockLayout.has_value())
+    pipelineInfo->options.scalarBlockLayout = *compileInfo.scalarBlockLayout;
+  if (compileInfo.scratchAccessBoundsChecks.has_value())
+    pipelineInfo->options.enableScratchAccessBoundsChecks = *compileInfo.scratchAccessBoundsChecks;
+  if (compileInfo.enableImplicitInvariantExports.has_value())
+    pipelineInfo->options.enableImplicitInvariantExports = *compileInfo.enableImplicitInvariantExports;
   if (compileInfo.optimizationLevel.has_value()) {
     pipelineInfo->options.optimizationLevel = static_cast<uint32_t>(compileInfo.optimizationLevel.value());
   }
@@ -162,56 +162,69 @@ Expected<BinaryData> GraphicsPipelineBuilder::buildGraphicsPipeline() {
   PipelineBuildInfo localPipelineInfo = {};
   localPipelineInfo.pGraphicsInfo = pipelineInfo;
   void *pipelineDumpHandle = runPreBuildActions(localPipelineInfo);
-  auto onExit = make_scope_exit([&] { runPostBuildActions(pipelineDumpHandle, {pipelineOut->pipelineBin}); });
+  auto onExit = make_scope_exit([&] {
+    std::vector<BinaryData> binaries;
+    for (const auto &out : compileInfo.gfxPipelineOut)
+      binaries.push_back(out.pipelineBin);
+    runPostBuildActions(pipelineDumpHandle, binaries);
+  });
 
   if (compileInfo.isGraphicsLibrary) {
     Result result = Result::Success;
+    GraphicsPipelineBuildOut pipelineOut = {};
     if (compileInfo.stageMask == 0) {
-      result = getCompiler().BuildColorExportShader(pipelineInfo, compileInfo.fsOutputs.data(), pipelineOut,
+      result = getCompiler().BuildColorExportShader(pipelineInfo, compileInfo.fsOutputs.data(), &pipelineOut,
                                                     pipelineDumpHandle);
 
     } else if (compileInfo.stageMask & Vkgc::ShaderStageBit::ShaderStageFragmentBit) {
       result =
-          getCompiler().buildGraphicsShaderStage(pipelineInfo, pipelineOut, UnlinkedStageFragment, pipelineDumpHandle);
+          getCompiler().buildGraphicsShaderStage(pipelineInfo, &pipelineOut, UnlinkedStageFragment, pipelineDumpHandle);
     } else {
-      result = getCompiler().buildGraphicsShaderStage(pipelineInfo, pipelineOut, UnlinkedStageVertexProcess,
+      result = getCompiler().buildGraphicsShaderStage(pipelineInfo, &pipelineOut, UnlinkedStageVertexProcess,
                                                       pipelineDumpHandle);
     }
 
     if (result != Result::Success)
       return createResultError(result, "Graphics pipeline compilation failed");
 
-    return pipelineOut->pipelineBin;
+    compileInfo.gfxPipelineOut.emplace_back(pipelineOut);
+    return pipelineOut.pipelineBin;
   }
 
   if (pipelineInfo->enableColorExportShader) {
-    Result result = getCompiler().buildGraphicsShaderStage(pipelineInfo, pipelineOut, UnlinkedStageVertexProcess,
+    GraphicsPipelineBuildOut pipelineOut = {};
+    Result result = getCompiler().buildGraphicsShaderStage(pipelineInfo, &pipelineOut, UnlinkedStageVertexProcess,
                                                            pipelineDumpHandle);
+    compileInfo.gfxPipelineOut.emplace_back(pipelineOut);
+
     if (result == Result::Success) {
-      free(compileInfo.pipelineBuf);
-      compileInfo.pipelineBuf = nullptr;
+      pipelineOut = {};
       result =
-          getCompiler().buildGraphicsShaderStage(pipelineInfo, pipelineOut, UnlinkedStageFragment, pipelineDumpHandle);
+          getCompiler().buildGraphicsShaderStage(pipelineInfo, &pipelineOut, UnlinkedStageFragment, pipelineDumpHandle);
+      compileInfo.gfxPipelineOut.emplace_back(pipelineOut);
     }
-    if (result == Result::Success && pipelineOut->fsOutputMetaData != nullptr) {
-      void *fsOuts = compileInfo.pipelineBuf;
-      compileInfo.pipelineBuf = nullptr;
-      result = getCompiler().BuildColorExportShader(pipelineInfo, pipelineOut->fsOutputMetaData, pipelineOut,
-                                                    pipelineDumpHandle);
-      free(fsOuts);
+
+    if (result == Result::Success && pipelineOut.fsOutputMetaData != nullptr) {
+      void *fsOutputMetadata = pipelineOut.fsOutputMetaData;
+
+      pipelineOut = {};
+      result = getCompiler().BuildColorExportShader(pipelineInfo, fsOutputMetadata, &pipelineOut, pipelineDumpHandle);
+      compileInfo.gfxPipelineOut.emplace_back(pipelineOut);
     }
 
     if (result != Result::Success)
       return createResultError(result, "Graphics pipeline compilation failed");
 
-    return pipelineOut->pipelineBin;
+    return pipelineOut.pipelineBin;
   }
 
-  Result result = getCompiler().BuildGraphicsPipeline(pipelineInfo, pipelineOut, pipelineDumpHandle);
+  GraphicsPipelineBuildOut pipelineOut = {};
+  Result result = getCompiler().BuildGraphicsPipeline(pipelineInfo, &pipelineOut, pipelineDumpHandle);
   if (result != Result::Success)
     return createResultError(result, "Graphics pipeline compilation failed");
 
-  return pipelineOut->pipelineBin;
+  compileInfo.gfxPipelineOut.emplace_back(pipelineOut);
+  return pipelineOut.pipelineBin;
 }
 
 // =====================================================================================================================
@@ -231,7 +244,13 @@ uint64_t GraphicsPipelineBuilder::getPipelineHash(Vkgc::PipelineBuildInfo buildI
 Error GraphicsPipelineBuilder::outputElfs(const StringRef suppliedOutFile) {
   CompileInfo &compileInfo = getCompileInfo();
   const InputSpec &firstInput = compileInfo.inputSpecs.front();
-  return outputElf(compileInfo.gfxPipelineOut.pipelineBin, suppliedOutFile, firstInput.filename);
+
+  for (const auto &pipelineOut : compileInfo.gfxPipelineOut) {
+    if (Error err = outputElf(pipelineOut.pipelineBin, suppliedOutFile, firstInput.filename))
+      return err;
+  }
+
+  return Error::success();
 }
 
 } // namespace StandaloneCompiler
diff --git a/llpc/tool/llpcRayTracingPipelineBuilder.cpp b/llpc/tool/llpcRayTracingPipelineBuilder.cpp
index c2d882927c..8abff93ba8 100644
--- a/llpc/tool/llpcRayTracingPipelineBuilder.cpp
+++ b/llpc/tool/llpcRayTracingPipelineBuilder.cpp
@@ -103,9 +103,10 @@ Expected<SmallVector<BinaryData>> RayTracingPipelineBuilder::buildRayTracingPipe
                          compileInfo.autoLayoutDesc);
 
   pipelineInfo->pInstance = nullptr; // Dummy, unused
-  pipelineInfo->pUserData = &compileInfo.pipelineBuf;
+  pipelineInfo->pUserData = &compileInfo;
   pipelineInfo->pfnOutputAlloc = allocateBuffer;
-  pipelineInfo->options.robustBufferAccess = compileInfo.robustBufferAccess;
+  if (compileInfo.robustBufferAccess.has_value())
+    pipelineInfo->options.robustBufferAccess = *compileInfo.robustBufferAccess;
   pipelineInfo->rtState.nodeStrideShift = Log2_32(compileInfo.bvhNodeStride);
   pipelineInfo->shaderCount = compileInfo.shaderModuleDatas.size();
 
diff --git a/llpc/translator/lib/SPIRV/SPIRVReader.cpp b/llpc/translator/lib/SPIRV/SPIRVReader.cpp
index ac0b198805..b2d5937f95 100644
--- a/llpc/translator/lib/SPIRV/SPIRVReader.cpp
+++ b/llpc/translator/lib/SPIRV/SPIRVReader.cpp
@@ -909,6 +909,7 @@ Type *SPIRVToLLVM::transTypeWithOpcode<spv::OpTypeStruct>(SPIRVType *const spvTy
   if (hasSamplerOrNested) {
     SPIRVTypeContext ctx(spvType, matrixStride, isColumnMajor, layout);
     m_imageTypeMap[ctx.asTuple()] = imageStructType;
+    m_hasSamplerInStruct = true;
   }
 
   if (usePadding)
@@ -1003,6 +1004,66 @@ Type *SPIRVToLLVM::transType(SPIRVType *t, unsigned matrixStride, bool columnMaj
   return res;
 }
 
+// =====================================================================================================================
+// Translate SPIR-V type to one or more LLVM types
+//
+// @param t : SPIR-V type to translate
+// @param matrixStride : Stride between columns for matrix types (in bytes)
+// @param columnMajor : Whether matrix is in column-major order (true) or row-major order (false)
+// @param layout : Layout mode for the type (e.g., scalar, vector, matrix)
+// @returns : Vector of translated LLVM types
+SmallVector<Type *> SPIRVToLLVM::transMultiTypes(SPIRVType *t, unsigned matrixStride, bool columnMajor,
+                                                 LayoutMode layout) {
+  SmallVector<Type *> types;
+  SPIRVTypeContext ctx(t, matrixStride, columnMajor, layout);
+  auto it = m_fullTypeMap.find(ctx.asTuple());
+  if (it == m_fullTypeMap.end()) {
+    auto res = transTypeImpl(t, matrixStride, columnMajor, layout);
+    m_fullTypeMap[ctx.asTuple()] = res;
+    types.push_back(res);
+  } else {
+    auto iterMultiTypes = m_multiTypesMap.find(it->second);
+    if (iterMultiTypes != m_multiTypesMap.end()) {
+      // Save the non-image type
+      types.push_back(iterMultiTypes->first);
+
+      // Save the image type
+      types.push_back(iterMultiTypes->second);
+    } else {
+      types.push_back(it->second);
+    }
+  }
+
+  return types;
+}
+
+// =====================================================================================================================
+// Map a single SPIR-V type to multiple LLVM types
+//
+// @param pt : LLVM type to be mapped
+// @param t : Corresponding SPIR-V type
+// @param matrixStride : Stride between columns for matrix types (in bytes)
+// @param columnMajor : Whether matrix is in column-major order (true) or row-major order (false)
+// @param layout : Layout mode for the type (e.g., scalar, vector, matrix)
+// @returns : True if mapping was successful, false otherwise
+bool SPIRVToLLVM::mapMultiTypes(Type *pt, SPIRVType *t, unsigned matrixStride, bool columnMajor, LayoutMode layout) {
+  if (m_multiTypesMap.find(pt) != m_multiTypesMap.end())
+    return true;
+
+  SPIRVTypeContext ctx(t, matrixStride, columnMajor, layout);
+  auto itNonImg = m_fullTypeMap.find(ctx.asTuple());
+  auto itImg = m_imageTypeMap.find(ctx.asTuple());
+
+  if (itNonImg != m_fullTypeMap.end() && itImg != m_imageTypeMap.end()) {
+    // When translating a struct type variable, the type's non-image part will be translated to i8 as a placeholder;
+    // it will be replaced by the real image type after calling transImagePointer().
+    m_multiTypesMap[pt] = itImg->second;
+    return true;
+  }
+
+  return false;
+}
+
 Type *SPIRVToLLVM::transTypeImpl(SPIRVType *t, unsigned matrixStride, bool columnMajor, LayoutMode layout) {
   t->validate();
   switch (t->getOpCode()) {
@@ -1016,8 +1077,14 @@ Type *SPIRVToLLVM::transTypeImpl(SPIRVType *t, unsigned matrixStride, bool colum
     auto ft = static_cast<SPIRVTypeFunction *>(t);
     auto rt = transType(ft->getReturnType());
     std::vector<Type *> pt;
-    for (size_t i = 0, e = ft->getNumParameters(); i != e; ++i)
-      pt.push_back(transType(ft->getParameterType(i)));
+    for (size_t i = 0, e = ft->getNumParameters(); i != e; ++i) {
+      SPIRVType *paramType = ft->getParameterType(i);
+
+      // A function parameter is translated to multiple types if it is a struct that contains a sampler
+      auto types = transMultiTypes(paramType);
+      for (size_t k = 0; k < types.size(); ++k)
+        pt.push_back(types[k]);
+    }
     return FunctionType::get(rt, pt, false);
   }
   case OpTypeImage:
@@ -1553,16 +1620,22 @@ static unsigned getMatrixPartDim(Type *type) {
 // Create a GEP to an element of a row-major matrix. Return the pointer to the element and its alignment, assuming the
 // given matrix alignment.
 std::pair<Value *, Align> SPIRVToLLVM::createGepIntoRowMajorMatrix(Type *matrixType, Value *matrixPtr,
-                                                                   Align matrixAlign, Value *row, Value *col) {
+                                                                   Align matrixAlign, Value *row, Value *col,
+                                                                   bool inBounds) {
   // The matrix type is [nrows x {[ncols x T], pad}]
   Value *zero = m_builder->getInt32(0);
   Value *indices[] = {zero, row ? row : zero, zero, col ? col : zero};
-  Value *pointer = m_builder->CreateGEP(matrixType, matrixPtr, indices);
+  GEPNoWrapFlags flags = inBounds ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none();
+  Value *pointer = m_builder->CreateGEP(matrixType, matrixPtr, indices, "", flags);
   Align align = matrixAlign;
   if (auto *gep = dyn_cast<GEPOperator>(pointer)) {
     const DataLayout &dl = m_m->getDataLayout();
     unsigned bitWidth = dl.getIndexSizeInBits(gep->getPointerAddressSpace());
+#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 513542
     MapVector<Value *, APInt> variableOffsets;
+#else
+    SmallMapVector<Value *, APInt, 4> variableOffsets;
+#endif
     APInt constantOffset{bitWidth, 0};
     bool success = gep->collectOffset(dl, bitWidth, variableOffsets, constantOffset);
     (void)success;
@@ -1704,7 +1777,8 @@ bool SPIRVToLLVM::postProcessRowMajorMatrix() {
               if (!row)
                 row = m_builder->getInt32(loadRow);
 
-              auto [ptr, align] = createGepIntoRowMajorMatrix(matrixType, matrix, load->getAlign(), row, col);
+              auto [ptr, align] =
+                  createGepIntoRowMajorMatrix(matrixType, matrix, load->getAlign(), row, col, /*inBounds=*/true);
               LoadInst *const newLoad = m_builder->CreateAlignedLoad(matrixElementType, ptr, align, load->isVolatile());
               newLoad->setOrdering(load->getOrdering());
               newLoad->setSyncScopeID(load->getSyncScopeID());
@@ -1781,7 +1855,8 @@ bool SPIRVToLLVM::postProcessRowMajorMatrix() {
                   element = m_builder->CreateExtractElement(column, storeRow);
               }
 
-              auto [ptr, align] = createGepIntoRowMajorMatrix(matrixType, matrix, store->getAlign(), row, col);
+              auto [ptr, align] =
+                  createGepIntoRowMajorMatrix(matrixType, matrix, store->getAlign(), row, col, /*inBounds=*/true);
               StoreInst *const newStore = m_builder->CreateAlignedStore(element, ptr, align, store->isVolatile());
               newStore->setOrdering(store->getOrdering());
               newStore->setSyncScopeID(store->getSyncScopeID());
@@ -1811,10 +1886,10 @@ bool SPIRVToLLVM::postProcessRowMajorMatrix() {
           Value *rhs = cmpInst->getOperand(1);
           auto [lhsRow, lhsCol] = valueMap.lookup(lhs);
           if (lhsRow || lhsCol)
-            lhs = createGepIntoRowMajorMatrix(matrixType, matrix, {}, lhsRow, lhsCol).first;
+            lhs = createGepIntoRowMajorMatrix(matrixType, matrix, {}, lhsRow, lhsCol, /*inBounds=*/false).first;
           auto [rhsRow, rhsCol] = valueMap.lookup(rhs);
           if (rhsRow || rhsCol)
-            rhs = createGepIntoRowMajorMatrix(matrixType, matrix, {}, rhsRow, rhsCol).first;
+            rhs = createGepIntoRowMajorMatrix(matrixType, matrix, {}, rhsRow, rhsCol, /*inBounds=*/false).first;
 
           Value *newCmpInst = m_builder->CreateCmp(cmpInst->getPredicate(), lhs, rhs, cmpInst->getName());
           cmpInst->replaceAllUsesWith(newCmpInst);
@@ -1920,6 +1995,10 @@ Value *SPIRVToLLVM::addLoadInstRecursively(SPIRVType *const spvType, Value *load
     loadPointer = loadPair.second;
   }
 
+  const auto addrSpace = loadPointer->getType()->getPointerAddressSpace();
+  const bool useSGep = addrSpace == SPIRAS_Input ||
+                       // TCS may read output variables. See more: OpenGL 4.6 spec 11.2.1 Tessellation Control Shaders
+                       (m_bm->getExecutionModel() == ExecutionModelTessellationControl && addrSpace == SPIRAS_Output);
   Constant *const zero = getBuilder()->getInt32(0);
   if (loadType->isStructTy() && !spvType->isTypeSampledImage() && !spvType->isTypeImage() &&
       !spvType->isTypeSampler() && spvType->getOpCode() != OpTypeRayQueryKHR
@@ -1936,7 +2015,8 @@ Value *SPIRVToLLVM::addLoadInstRecursively(SPIRVType *const spvType, Value *load
 
       SmallVector<Value *, 2> indices = {zero, getBuilder()->getInt32(memberIndex)};
 
-      Value *memberLoadPointer = getBuilder()->CreateGEP(loadType, loadPointer, indices);
+      Value *memberLoadPointer = useSGep ? getBuilder()->create<StructuralGepOp>(loadPointer, loadType, false, indices)
+                                         : getBuilder()->CreateGEP(loadType, loadPointer, indices);
 
       Type *memberLoadType = nullptr;
 
@@ -1986,7 +2066,8 @@ Value *SPIRVToLLVM::addLoadInstRecursively(SPIRVType *const spvType, Value *load
       if (needsPad)
         indices.push_back(zero);
 
-      Value *elementLoadPointer = getBuilder()->CreateGEP(loadType, loadPointer, indices);
+      Value *elementLoadPointer = useSGep ? getBuilder()->create<StructuralGepOp>(loadPointer, loadType, false, indices)
+                                          : getBuilder()->CreateGEP(loadType, loadPointer, indices);
       Type *const elementLoadType = GetElementPtrInst::getIndexedType(loadType, indices);
       Value *const elementLoad = addLoadInstRecursively(spvElementType, elementLoadPointer, elementLoadType, isVolatile,
                                                         isCoherent, isNonTemporal);
@@ -2063,21 +2144,21 @@ void SPIRVToLLVM::addStoreInstRecursively(SPIRVType *const spvType, Value *store
   // the alignment is greater than 1 (if the constant is storing an entire structure, because we have to use packed
   // structs to encoded layout information from SPIR-V into LLVM, we can very easily output large stores with align 1
   // that causes problems with the load/store vectorizer and DAG combining).
-  if (isa<Constant>(storeValue) && alignment > 1) {
+  // Note: do not special case coherent variables because the backend supports atomic stores with simple types only, so
+  // the store needs to be split.
+  if (isa<Constant>(storeValue) && alignment > 1 && !isCoherent) {
     Constant *const constStoreValue =
         buildConstStoreRecursively(spvType, storePointer->getType(), storeType, cast<Constant>(storeValue));
 
     StoreInst *const store = getBuilder()->CreateAlignedStore(constStoreValue, storePointer, alignment, isVolatile);
 
-    if (isCoherent)
-      store->setAtomic(AtomicOrdering::Unordered);
-
     if (isNonTemporal)
       transNonTemporalMetadata(store);
 
     return;
   }
 
+  const bool useSGep = storePointer->getType()->getPointerAddressSpace() == SPIRAS_Output;
   Value *const zero = getBuilder()->getInt32(0);
   if (storeType->isStructTy() && !spvType->isTypeSampledImage() && !spvType->isTypeImage() &&
       !spvType->isTypeSampler() && spvType->getOpCode() != OpTypeRayQueryKHR) {
@@ -2087,7 +2168,9 @@ void SPIRVToLLVM::addStoreInstRecursively(SPIRVType *const spvType, Value *store
     for (unsigned i = 0, memberCount = spvType->getStructMemberCount(); i < memberCount; i++) {
       const unsigned memberIndex = needsPad ? lookupRemappedTypeElements(spvType, i) : i;
       Value *indices[] = {zero, getBuilder()->getInt32(memberIndex)};
-      Value *const memberStorePointer = getBuilder()->CreateGEP(storeType, storePointer, indices);
+      Value *const memberStorePointer =
+          useSGep ? getBuilder()->create<StructuralGepOp>(storePointer, storeType, false, indices)
+                  : getBuilder()->CreateGEP(storeType, storePointer, indices);
       Type *const memberStoreType = GetElementPtrInst::getIndexedType(storeType, indices);
       Value *const memberStoreValue = getBuilder()->CreateExtractValue(storeValue, i);
       addStoreInstRecursively(spvType->getStructMemberType(i), memberStorePointer, memberStoreType, memberStoreValue,
@@ -2104,11 +2187,11 @@ void SPIRVToLLVM::addStoreInstRecursively(SPIRVType *const spvType, Value *store
       SmallVector<Value *, 3> indices;
       indices.push_back(zero);
       indices.push_back(getBuilder()->getInt32(i));
-
       if (needsPad)
         indices.push_back(zero);
-
-      Value *const elementStorePointer = getBuilder()->CreateGEP(storeType, storePointer, indices);
+      Value *const elementStorePointer =
+          useSGep ? getBuilder()->create<StructuralGepOp>(storePointer, storeType, false, indices)
+                  : getBuilder()->CreateGEP(storeType, storePointer, indices);
       Type *const elementStoreType = GetElementPtrInst::getIndexedType(storeType, indices);
       Value *const elementStoreValue = getBuilder()->CreateExtractValue(storeValue, i);
       addStoreInstRecursively(spvElementType, elementStorePointer, elementStoreType, elementStoreValue, isVolatile,
@@ -3334,7 +3417,9 @@ SmallVector<Value *> SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) {
         }
       }
 
-      if (inBound)
+      if (pointerStorageClass == StorageClassInput || pointerStorageClass == StorageClassOutput)
+        base = getBuilder()->create<StructuralGepOp>(base, basePointeeType, inBound, gepIndices);
+      else if (inBound)
         base = getBuilder()->CreateInBoundsGEP(basePointeeType, base, gepIndices);
       else
         base = getBuilder()->CreateGEP(basePointeeType, base, gepIndices);
@@ -3381,7 +3466,6 @@ SmallVector<Value *> SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) {
         break;
       }
       case OpTypeArray:
-      case OpTypeRuntimeArray: {
         gepIndices.push_back(index);
 
         if (typeMaybeRemapped && isRemappedTypeElements(spvAccessElementType)) {
@@ -3390,6 +3474,30 @@ SmallVector<Value *> SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) {
           gepIndices.push_back(getBuilder()->getInt32(0));
         }
 
+        spvAccessElementType = spvAccessElementType->getArrayElementType();
+        break;
+      case OpTypeRuntimeArray: {
+        bool isRemapped = typeMaybeRemapped && isRemappedTypeElements(spvAccessElementType);
+        auto *globalBase = dyn_cast<GlobalVariable>(base);
+        bool isDescriptorArray = globalBase && globalBase->getValueType()->isArrayTy();
+        if (!isDescriptorArray && base->getType() == getBuilder()->getPtrTy(ADDR_SPACE_BUFFER_FAT_POINTER)) {
+          Type *const arrayTy = GetElementPtrInst::getIndexedType(basePointeeType, gepIndices);
+          flushGep();
+          uint32_t stride = m_m->getDataLayout().getTypeAllocSize(arrayTy->getArrayElementType());
+          basePointeeType = arrayTy->getArrayElementType();
+          if (isRemapped) {
+            basePointeeType = basePointeeType->getStructElementType(0);
+          }
+          base = getBuilder()->create<BufferIndexOp>(base, stride, index);
+        } else {
+          gepIndices.push_back(index);
+
+          if (isRemapped) {
+            // If we have padding in an array, we inserted a struct to add that
+            // padding, and so we need an extra constant 0 index.
+            gepIndices.push_back(getBuilder()->getInt32(0));
+          }
+        }
         spvAccessElementType = spvAccessElementType->getArrayElementType();
         break;
       }
@@ -4863,9 +4971,14 @@ SmallVector<Value *> SPIRVToLLVM::transValueMultiWithOpcode<OpVariable>(SPIRVVal
     getBuilder()->SetInsertPointPastAllocas(f);
     values.push_back(transImagePointer(spvVar, spvVar->getMemObjType()));
 
-    // Append const value 0 as default offset if this variable is struct type with image member.
-    if (itNonImage->second)
-      values.push_back(getBuilder()->getInt32(0));
+    if (itNonImage->second != nullptr) {
+      auto nonImgType = itNonImage->second->getType();
+      auto itTypes = m_multiTypesMap.find(nonImgType);
+      if (itTypes != m_multiTypesMap.end()) {
+        // Update the image type with the real type { ptr addrspace(4), i32, i32, ptr addrspace(4), i32, i32 }
+        itTypes->second = values[1]->getType();
+      }
+    }
   }
 
   m_variableMap.try_emplace({spvValue, f}, values);
@@ -4907,6 +5020,17 @@ Value *SPIRVToLLVM::transVariableNonImage(SPIRVValue *const spvValue) {
 
   Type *const varType = transType(spvVarType, 0, true, layout);
 
+  if (m_hasSamplerInStruct && storageClass == StorageClassUniformConstant) {
+    if (mapMultiTypes(ptrType, spvVarType, 0, true, layout)) {
+      // Record the spvType so that we know which arguments need to be mapped to multiple values when
+      // translating a function
+      m_spvMultiTypesMap.insert(spvVar->getType());
+    }
+
+    // reset the state
+    m_hasSamplerInStruct = false;
+  }
+
   SPIRVValue *const spvInitializer = spvVar->getInitializer();
   Constant *initializer = nullptr;
 
@@ -5225,7 +5349,7 @@ Value *SPIRVToLLVM::transString(const SPIRVString *spvValue) {
 // |  f16     |  f16     |   Y   |  Y  |
 // |  bf16    |  bf16    |   Y   |  N  |
 // |  iu8     |  i32     |   Y   |  Y  |
-// |  iu4     |  i32     |   Y   |  N  |
+// |  iu4     |  i32     |   Y   |  Y  |
 // For integer types, arbitrary signedness combinations are supported for the
 // A/B matrices.C/D matrices are always signed.
 
@@ -5265,21 +5389,6 @@ lgc::CooperativeMatrixElementType SPIRVToLLVM::mapToBasicType(SPIRVType *const e
   return basicTy;
 }
 
-lgc::CooperativeMatrixLayout SPIRVToLLVM::getLayout(lgc::CooperativeMatrixElementType elemType) {
-  const Vkgc::GfxIpVersion gfxIp = getPipelineContext()->getGfxIpVersion();
-
-  if (BuilderCommon::isTypeNCooperativeMatrix(elemType, 32)) {
-    if (gfxIp.major == 11)
-      return lgc::CooperativeMatrixLayout::AccumulatorMatrixLayout;
-    return lgc::CooperativeMatrixLayout::Gfx10AccumulatorMatrixLayout;
-  }
-  if (BuilderCommon::isTypeNCooperativeMatrix(elemType, 16) || BuilderCommon::isTypeNCooperativeMatrix(elemType, 8))
-    return lgc::CooperativeMatrixLayout::FactorMatrixLayout;
-
-  llvm_unreachable("The element type is not supported!");
-  return lgc::CooperativeMatrixLayout::InvalidLayout;
-}
-
 // =====================================================================================================================
 // Mapping the use to layout
 // @param use : CooperativeMatrixUse value.
@@ -5292,6 +5401,7 @@ lgc::CooperativeMatrixLayout SPIRVToLLVM::getCooperativeMatrixKHRLayout(Cooperat
   const Vkgc::GfxIpVersion gfxIp = getPipelineContext()->getGfxIpVersion();
   if (use == CooperativeMatrixUse::CooperativeMatrixUseMatrixAKHR ||
       use == CooperativeMatrixUse::CooperativeMatrixUseMatrixBKHR) {
+
     return lgc::CooperativeMatrixLayout::FactorMatrixLayout;
   }
   if (use == CooperativeMatrixUse::CooperativeMatrixUseMatrixAccumulatorKHR) {
@@ -5404,6 +5514,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode<OpCooperativeMatrixLoadKHR>
   // by the component size).
   Type *elementllType = getBuilder()->transCooperativeMatrixElementType(elemType);
   unsigned elementSize = static_cast<unsigned>(m_m->getDataLayout().getTypeSizeInBits(elementllType) / 8);
+  elementSize = std::max(elementSize, (unsigned)1);
   unsigned alignmentInRowCol = (isColMajor ? rows : columns) * elementSize;
   unsigned loadAlignment = std::min((unsigned)16, alignmentInRowCol);
   lgc::CooperativeMatrixLayout layout = getCooperativeMatrixKHRLayout(use, elemType, rows, columns);
@@ -5502,6 +5613,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode<OpCooperativeMatrixStoreKHR
   // (depending on ColumnMajor) of the matrix (where the natural alignment is the number of columns/rows multiplied
   // by the component size).
   unsigned elementSize = static_cast<unsigned>(m_m->getDataLayout().getTypeSizeInBits(elemltType) / 8);
+  elementSize = std::max(elementSize, (unsigned)1);
   unsigned alignmentInRowCol = (isColMajor ? rows : columns) * elementSize;
   unsigned storeAlignment = std::min((unsigned)16, alignmentInRowCol);
   getBuilder()->create<CooperativeMatrixStoreOp>(pointer, stride, isColMajor, elemType, layout, memoryAccess,
@@ -5531,7 +5643,6 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode<OpCooperativeMatrixMulAddKH
   bool isSignedB = static_cast<bool>(static_cast<SPIRVCooperativeMatrixMulAddKHR *>(spvInst)->getMatrixBSigned());
   bool isSat = static_cast<bool>(static_cast<SPIRVCooperativeMatrixMulAddKHR *>(spvInst)->getMatrixSatAccumulation());
 
-  // Current SPIRV does not supported fp8 or bf8 yet, so the types of A and B use the same value.
   Value *coopMatrixD = getBuilder()->create<CooperativeMatrixMulAddOp>(
       coopMatrixC->getType(), coopMatrixA, coopMatrixB, coopMatrixC, isSignedA, isSignedB, isSat, 0, elemBasicTypeA,
       elemBasicTypeA, elemBasicTypeC, "mulAdd");
@@ -6173,8 +6284,9 @@ SmallVector<Value *> SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *bv, Fu
     SPIRVFunctionCall *bc = static_cast<SPIRVFunctionCall *>(bv);
     SmallVector<Value *, 8> args;
     for (SPIRVValue *bArg : bc->getArgumentValues()) {
-      Value *arg = transValue(bArg, f, bb);
-      args.push_back(arg);
+      // Translate the argument to multi-values if it's a struct and contains a sampler
+      auto values = transValueMulti(bArg, f, bb);
+      args.append(values);
     }
     auto call = CallInst::Create(transFunction(bc->getFunction()), args, "", bb);
     setCallingConv(call);
@@ -6994,16 +7106,30 @@ Function *SPIRVToLLVM::transFunction(SPIRVFunction *bf) {
     f->addFnAttr(Attribute::AlwaysInline);
   }
 
-  for (Function::arg_iterator i = f->arg_begin(), e = f->arg_end(); i != e; ++i) {
-    auto ba = bf->getArgument(i->getArgNo());
-    mapValue(ba, &(*i));
-    setName(&(*i), ba);
+  for (size_t i = 0, realArgIndex = 0; i < bf->getNumArguments(); ++i, ++realArgIndex) {
+    auto ba = bf->getArgument(i);
+    SPIRVType *paramType = bf->getFunctionType()->getParameterType(i);
+
+    Value *args[2] = {f->getArg(realArgIndex), nullptr};
+
+    // If the parameter is a structure and it contains a sampler, two arguments will be passed
+    if (m_spvMultiTypesMap.find(paramType) != m_spvMultiTypesMap.end()) {
+      assert(realArgIndex + 1 < f->arg_size());
+      args[1] = f->getArg(++realArgIndex);
+      mapValue(ba, ArrayRef<Value *>(args, 2));
+    } else {
+      mapValue(ba, args[0]);
+    }
+
+    setName(args[0], ba);
 
     SPIRVWord maxOffset = 0;
     if (ba->hasDecorate(DecorationMaxByteOffset, 0, &maxOffset)) {
       AttrBuilder builder(*m_context);
       builder.addDereferenceableAttr(maxOffset);
-      i->addAttrs(builder);
+      cast<Argument>(args[0])->addAttrs(builder);
+      if (args[1])
+        cast<Argument>(args[1])->addAttrs(builder);
     }
   }
 
@@ -7059,6 +7185,11 @@ Function *SPIRVToLLVM::transFunction(SPIRVFunction *bf) {
     Type *irArgTy = f->getArg(i)->getType();
     auto argTy = bf->getArgument(i)->getType();
     argTys.push_back(getContArgTy(irArgTy, argTy));
+
+    // If the arg is translated to multiple types, set the image type as well
+    auto itImg = m_multiTypesMap.find(irArgTy);
+    if (itImg != m_multiTypesMap.end())
+      argTys.push_back(getContArgTy(itImg->second, argTy));
   }
   Type *irRetTy = f->getFunctionType()->getReturnType();
   TypedFuncTy funcTys(getContArgTy(irRetTy, bf->getType()), argTys);
@@ -7270,6 +7401,15 @@ static void scanImageDescNonUniformCV(SPIRVToLLVM::ExtractedImageInfo *info, SPI
 
     bool isAccessChain = opcode == OpAccessChain || opcode == OpInBoundsAccessChain;
     if (isAccessChain) {
+      std::vector<SPIRVValue *> operands = static_cast<SPIRVInstruction *>(spvValue)->getOperands();
+      for (SPIRVValue *operand : operands) {
+        if (operand->hasDecorate(DecorationNonUniformEXT)) {
+          if (image)
+            info->flags |= lgc::Builder::ImageFlagNonUniformImage;
+          if (sampler)
+            info->flags |= lgc::Builder::ImageFlagNonUniformSampler;
+        }
+      }
       spvValue = static_cast<SPIRVInstruction *>(spvValue)->getOperands()[0];
       continue;
     }
@@ -8591,6 +8731,9 @@ bool SPIRVToLLVM::translate(ExecutionModel entryExecModel, const char *entryName
     }
   }
 
+  for (SPIRVExtInst *EI : m_bm->getDebugInstVec())
+    compilationUnit = m_dbgTran.transDebugInst(EI);
+
   if (m_scratchBoundsChecksEnabled) {
     // Insert the scratch out of bounds checks for any feasible memory instruction. The SPIRV to LLVM memop mapping
     // gets filled while translating OpLoads and OpStores.
@@ -8794,9 +8937,11 @@ bool SPIRVToLLVM::transMetadata() {
           meshMode.workgroupSizeZ = overrideShaderGroupSizeZ;
         }
 
-        if (bf->getExecutionMode(ExecutionModeDerivativeGroupQuadsNV))
+        if (bf->getExecutionMode(ExecutionModeDerivativeGroupQuadsNV) ||
+            bf->getExecutionMode(ExecutionModeDerivativeGroupQuadsKHR))
           meshMode.derivativeMode = DerivativeMode::Quads;
-        else if (bf->getExecutionMode(ExecutionModeDerivativeGroupLinearNV))
+        else if (bf->getExecutionMode(ExecutionModeDerivativeGroupLinearNV) ||
+                 bf->getExecutionMode(ExecutionModeDerivativeGroupLinearKHR))
           meshMode.derivativeMode = DerivativeMode::Linear;
         else
           meshMode.derivativeMode = DerivativeMode::None;
@@ -8895,12 +9040,13 @@ bool SPIRVToLLVM::transMetadata() {
 
         ComputeShaderMode computeMode = {};
 
-        if (bf->getExecutionMode(ExecutionModeDerivativeGroupQuadsNV))
+        if (bf->getExecutionMode(ExecutionModeDerivativeGroupQuadsKHR))
           computeMode.derivativeMode = DerivativeMode::Quads;
-        else if (bf->getExecutionMode(ExecutionModeDerivativeGroupLinearNV))
+        else if (bf->getExecutionMode(ExecutionModeDerivativeGroupLinearKHR))
           computeMode.derivativeMode = DerivativeMode::Linear;
         else
           computeMode.derivativeMode = DerivativeMode::None;
+
         if (bf->getExecutionMode(ExecutionModeQuadDerivativesKHR))
           computeMode.derivativeMode = DerivativeMode::Quads;
 
@@ -10262,7 +10408,8 @@ Value *SPIRVToLLVM::transGLSLExtInst(SPIRVExtInst *extInst, BasicBlock *bb) {
 
   case GLSLstd450PackHalf2x16: {
     // Convert <2 x float> into <2 x half> then pack into i32.
-    Value *val = getBuilder()->CreateFPTrunc(args[0], FixedVectorType::get(getBuilder()->getHalfTy(), 2));
+    Value *val = getBuilder()->CreateFpTruncWithRounding(args[0], FixedVectorType::get(getBuilder()->getHalfTy(), 2),
+                                                         RoundingMode::TowardZero);
     return getBuilder()->CreateBitCast(val, getBuilder()->getInt32Ty());
   }
 
@@ -10542,6 +10689,18 @@ void SPIRVToLLVM::transMemFence(BasicBlock *bb, SPIRVWord memSema, SPIRVWord mem
   if (ordering == AtomicOrdering::NotAtomic)
     return;
 
+  // Downgrade ScopeDevice to ScopeWorkgroup if memory semantics permits it.
+  // If memory semantics implies that shared memory is local to a workgroup, no need for ScopeDevice that would mean all
+  // workgroups in the device.
+  // Check that no memory semantics other than MemorySemanticsSubgroupMemoryMask or MemorySemanticsWorkgroupMemoryMask
+  // are set.
+  if (memScope == ScopeDevice &&
+      (memSema & (MemorySemanticsUniformMemoryMask | MemorySemanticsCrossWorkgroupMemoryMask |
+                  MemorySemanticsAtomicCounterMemoryMask | MemorySemanticsImageMemoryMask |
+                  MemorySemanticsOutputMemoryMask)) == MemorySemanticsMaskNone) {
+    memScope = ScopeWorkgroup;
+  }
+
   SyncScope::ID scope = SyncScope::System;
 
   switch (memScope) {
diff --git a/llpc/translator/lib/SPIRV/SPIRVReader.h b/llpc/translator/lib/SPIRV/SPIRVReader.h
index 0ef2965404..b8f6e4ea94 100644
--- a/llpc/translator/lib/SPIRV/SPIRVReader.h
+++ b/llpc/translator/lib/SPIRV/SPIRVReader.h
@@ -116,6 +116,13 @@ class SPIRVToLLVM {
 
   Type *transType(SPIRVType *bt, unsigned matrixStride = 0, bool columnMajor = true,
                   LayoutMode layout = LayoutMode::None);
+
+  SmallVector<Type *> transMultiTypes(SPIRVType *bt, unsigned matrixStride = 0, bool columnMajor = true,
+                                      LayoutMode layout = LayoutMode::None);
+
+  bool mapMultiTypes(Type *pt, SPIRVType *t, unsigned matrixStride = 0, bool columnMajor = true,
+                     LayoutMode layout = LayoutMode::None);
+
   template <spv::Op>
   Type *transTypeWithOpcode(SPIRVType *bt, unsigned matrixStride, bool columnMajor, LayoutMode layout);
   Type *transTypeArray(SPIRVType *bt, unsigned matrixStride, bool columnMajor, LayoutMode layout);
@@ -230,7 +237,7 @@ class SPIRVToLLVM {
   Constant *buildConstStoreRecursively(SPIRVType *const, Type *const, Type *const, Constant *const);
 
   std::pair<Value *, Align> createGepIntoRowMajorMatrix(Type *matrixType, Value *matrixPtr, Align matrixAlign,
-                                                        Value *row, Value *col);
+                                                        Value *row, Value *col, bool inBounds);
 
   // Post-process translated LLVM module to undo row major matrices.
   bool postProcessRowMajorMatrix();
@@ -296,6 +303,8 @@ class SPIRVToLLVM {
   SPIRVToLLVMDbgTran m_dbgTran;
   DenseMap<std::pair<SPIRVValue *, Function *>, SmallVector<Value *>> m_variableMap;
   DenseMap<SPIRVValue *, Value *> m_variableNonImageMap;
+  DenseMap<Type *, Type *> m_multiTypesMap;
+  DenseSet<SPIRVType *> m_spvMultiTypesMap;
 
   // Hash map with correlation between (SPIR-V) OpAccessChain and its returned (dereferenced) type.
   // We have to store base type because opaque-pointers are removing information about dereferenced type.
@@ -318,7 +327,7 @@ class SPIRVToLLVM {
 
   bool m_maximallyReconverges = false;
   bool m_hasDemoteToHelper = false;
-
+  bool m_hasSamplerInStruct = false;
   enum class LlvmMemOpType : uint8_t { IS_LOAD, IS_STORE };
   struct ScratchBoundsCheckData {
     LlvmMemOpType memOpType;
@@ -328,7 +337,6 @@ class SPIRVToLLVM {
 
   lgc::CooperativeMatrixElementType mapToBasicType(Type *const ltType);
   lgc::CooperativeMatrixElementType mapToBasicType(SPIRVType *const spvType);
-  lgc::CooperativeMatrixLayout getLayout(lgc::CooperativeMatrixElementType elemTy);
   lgc::CooperativeMatrixLayout getCooperativeMatrixKHRLayout(CooperativeMatrixUse use,
                                                              lgc::CooperativeMatrixElementType elemTy, unsigned rows,
                                                              unsigned columns);
diff --git a/llpc/translator/lib/SPIRV/SPIRVToLLVMDbgTran.cpp b/llpc/translator/lib/SPIRV/SPIRVToLLVMDbgTran.cpp
index 3eb1299c25..0a4907836a 100644
--- a/llpc/translator/lib/SPIRV/SPIRVToLLVMDbgTran.cpp
+++ b/llpc/translator/lib/SPIRV/SPIRVToLLVMDbgTran.cpp
@@ -308,7 +308,7 @@ DICompositeType *SPIRVToLLVMDbgTran::transTypeComposite(const SPIRVExtInst *Debu
   StringRef Name = getString(Ops[NameIdx]);
   DIFile *File = getFile(Ops[SourceIdx]);
   unsigned LineNo = getConstant(Ops[LineIdx]);
-  DIScope *ParentScope = getScope(BM->getEntry(Ops[ParentIdx]));
+  DIScope *ParentScope = getScope(BM->getEntry(Ops[ScopeIdx]));
 
   uint64_t Size = 0;
   SPIRVEntry *SizeEntry = BM->getEntry(Ops[SizeIdx]);
@@ -327,7 +327,7 @@ DICompositeType *SPIRVToLLVMDbgTran::transTypeComposite(const SPIRVExtInst *Debu
   DINode::DIFlags Flags = mapToDIFlags(getConstant(Ops[FlagsIdx]));
 
   DICompositeType *CT = nullptr;
-  switch (Ops[TagIdx]) {
+  switch (getConstant(Ops[TagIdx])) {
   case SPIRVDebug::Class:
     CT = Builder.createClassType(ParentScope, Name, File, LineNo, Size, Align, 0, Flags, DerivedFrom,
                                  DINodeArray() /*elements*/,
@@ -367,7 +367,7 @@ DINode *SPIRVToLLVMDbgTran::transTypeMember(const SPIRVExtInst *DebugInst) {
   DIFile *File = getFile(Ops[SourceIdx]);
   unsigned LineNo = getConstant(Ops[LineIdx]);
   StringRef Name = getString(Ops[NameIdx]);
-  DIScope *Scope = getScope(BM->getEntry(Ops[ParentIdx]));
+  DIScope *Scope = getScope(BM->getEntry(DebugInst->getScope()));
   DIType *BaseType = transDebugInst<DIType>(BM->get<SPIRVExtInst>(Ops[TypeIdx]));
   uint64_t OffsetInBits = BM->get<SPIRVConstant>(Ops[OffsetIdx])->getZExtIntValue();
 
@@ -590,8 +590,8 @@ MDNode *SPIRVToLLVMDbgTran::transGlobalVariable(const SPIRVExtInst *DebugInst) {
   if (Ops.size() > MinOperandCount) {
     StaticMemberDecl = transDebugInst<DIDerivedType>(BM->get<SPIRVExtInst>(Ops[StaticMemberDeclarationIdx]));
   }
-  bool IsLocal = Ops[FlagsIdx] & SPIRVDebug::FlagIsLocal;
-  bool IsDefinition = Ops[FlagsIdx] & SPIRVDebug::FlagIsDefinition;
+  bool IsLocal = getConstant(Ops[FlagsIdx]) & SPIRVDebug::FlagIsLocal;
+  bool IsDefinition = getConstant(Ops[FlagsIdx]) & SPIRVDebug::FlagIsDefinition;
   MDNode *VarDecl = nullptr;
   if (IsDefinition) {
     VarDecl = Builder.createGlobalVariableExpression(Parent, Name, LinkageName, File, LineNo, Ty, IsLocal, IsDefinition,
diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRV.debug.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRV.debug.h
index c8b461a5f9..e0f3676091 100644
--- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRV.debug.h
+++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRV.debug.h
@@ -221,7 +221,7 @@ enum {
   SourceIdx       = 2,
   LineIdx         = 3,
   ColumnIdx       = 4,
-  ParentIdx       = 5,
+  ScopeIdx        = 5,
   LinkageNameIdx  = 6,
   SizeIdx         = 7,
   FlagsIdx        = 8,
@@ -237,12 +237,11 @@ enum {
   SourceIdx       = 2,
   LineIdx         = 3,
   ColumnIdx       = 4,
-  ParentIdx       = 5,
-  OffsetIdx       = 6,
-  SizeIdx         = 7,
-  FlagsIdx        = 8,
-  ValueIdx        = 9,
-  MinOperandCount = 9
+  OffsetIdx       = 5,
+  SizeIdx         = 6,
+  FlagsIdx        = 7,
+  ValueIdx        = 8,
+  MinOperandCount = 8
 };
 }
 
diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVEnum.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVEnum.h
index 4d7fbd0288..e3cc0b9f3b 100644
--- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVEnum.h
+++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVEnum.h
@@ -213,6 +213,8 @@ template <> inline void SPIRVMap<SPIRVCapabilityKind, SPIRVCapVec>::init() {
   ADD_VEC_INIT(CapabilityCooperativeMatrixKHR, {CapabilityShader});
   ADD_VEC_INIT(CapabilityComputeDerivativeGroupLinearNV, {CapabilityShader});
   ADD_VEC_INIT(CapabilityComputeDerivativeGroupQuadsNV, {CapabilityShader});
+  ADD_VEC_INIT(CapabilityComputeDerivativeGroupLinearKHR, {CapabilityShader});
+  ADD_VEC_INIT(CapabilityComputeDerivativeGroupQuadsKHR, {CapabilityShader});
   ADD_VEC_INIT(CapabilityQuadControlKHR, {CapabilityShader});
 }
 
diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.h
index 90ab0639be..dcfce39db2 100644
--- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.h
+++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.h
@@ -1544,22 +1544,26 @@ class SPIRVExtInst : public SPIRVFunctionCallGeneric<OpExtInst, 5> {
 public:
   SPIRVExtInst(SPIRVType *TheType, SPIRVId TheId, SPIRVId TheBuiltinSet, SPIRVWord TheEntryPoint,
                const std::vector<SPIRVWord> &TheArgs, SPIRVBasicBlock *BB)
-      : SPIRVFunctionCallGeneric(TheType, TheId, TheArgs, BB), ExtSetId(TheBuiltinSet), ExtOp(TheEntryPoint) {
+      : SPIRVFunctionCallGeneric(TheType, TheId, TheArgs, BB), ExtSetId(TheBuiltinSet), ScopeId(SPIRVWORD_MAX),
+        ExtOp(TheEntryPoint) {
     setExtSetKindById();
     validate();
   }
   SPIRVExtInst(SPIRVType *TheType, SPIRVId TheId, SPIRVId TheBuiltinSet, SPIRVWord TheEntryPoint,
                const std::vector<SPIRVValue *> &TheArgs, SPIRVBasicBlock *BB)
-      : SPIRVFunctionCallGeneric(TheType, TheId, TheArgs, BB), ExtSetId(TheBuiltinSet), ExtOp(TheEntryPoint) {
+      : SPIRVFunctionCallGeneric(TheType, TheId, TheArgs, BB), ExtSetId(TheBuiltinSet), ScopeId(SPIRVWORD_MAX),
+        ExtOp(TheEntryPoint) {
     setExtSetKindById();
     validate();
   }
   SPIRVExtInst(SPIRVExtInstSetKind SetKind = SPIRVEIS_Count, unsigned ExtOC = SPIRVWORD_MAX)
-      : ExtSetId(SPIRVWORD_MAX), ExtOp(ExtOC), ExtSetKind(SetKind) {}
+      : ExtSetId(SPIRVWORD_MAX), ScopeId(SPIRVWORD_MAX), ExtOp(ExtOC), ExtSetKind(SetKind) {}
   void setExtSetId(unsigned Set) { ExtSetId = Set; }
   void setExtOp(unsigned ExtOC) { ExtOp = ExtOC; }
+  void setScope(unsigned scope) { ScopeId = scope; }
   SPIRVId getExtSetId() const { return ExtSetId; }
   SPIRVWord getExtOp() const { return ExtOp; }
+  SPIRVWord getScope() const { return ScopeId; }
   SPIRVExtInstSetKind getExtSetKind() const { return ExtSetKind; }
   void setExtSetKindById() {
     assert(Module && "Invalid module");
@@ -1612,6 +1616,13 @@ class SPIRVExtInst : public SPIRVFunctionCallGeneric<OpExtInst, 5> {
         unsigned dbgCol = Module->get<SPIRVConstant>(Args[3])->getZExtIntValue();
         SPIRVLine *line = Module->add(new SPIRVLine(Module, Args[0], dbgLn, dbgCol));
         Module->setCurrentLine(line);
+      } else if (ExtOpDebug == NonSemanticShaderDebugInfo100DebugTypeComposite) {
+        using namespace SPIRVDebug::Operand::TypeComposite;
+        // The parent Scope of the member is implicit from DebugTypeComposite lists.
+        for (unsigned idx = FirstMemberIdx; idx < Args.size(); idx++) {
+          auto member = static_cast<SPIRVExtInst *>(Module->getEntry(Args[idx]));
+          member->setScope(Args[ScopeIdx]);
+        }
       }
     }
   }
@@ -1623,6 +1634,7 @@ class SPIRVExtInst : public SPIRVFunctionCallGeneric<OpExtInst, 5> {
 
 protected:
   SPIRVId ExtSetId;
+  SPIRVWord ScopeId;
   union {
     SPIRVWord ExtOp;
     GLSLExtOpKind ExtOpGLSL;
diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
index 5a82f5cd8c..8e25023b0e 100644
--- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
+++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
@@ -569,8 +569,8 @@ inline bool isValid(spv::Capability V) {
   case CapabilityWorkgroupMemoryExplicitLayoutKHR:
   case CapabilityWorkgroupMemoryExplicitLayout8BitAccessKHR:
   case CapabilityWorkgroupMemoryExplicitLayout16BitAccessKHR:
-  case CapabilityComputeDerivativeGroupLinearNV:
-  case CapabilityComputeDerivativeGroupQuadsNV:
+  case CapabilityComputeDerivativeGroupLinearKHR:
+  case CapabilityComputeDerivativeGroupQuadsKHR:
   case CapabilityExpectAssumeKHR:
   case CapabilityGroupNonUniformRotateKHR:
   case CapabilityQuadControlKHR:
diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp
index eeff4e8256..8e408b42d8 100644
--- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp
+++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp
@@ -467,7 +467,9 @@ void SPIRVModuleImpl::layoutEntry(SPIRVEntry *E) {
     SPIRVExtInst *EI = static_cast<SPIRVExtInst *>(E);
     if ((EI->getExtSetKind() == SPIRVEIS_Debug || EI->getExtSetKind() == SPIRVEIS_NonSemanticShaderDebugInfo100) &&
         EI->getExtOp() != SPIRVDebug::Declare && EI->getExtOp() != SPIRVDebug::Value &&
-        EI->getExtOp() != SPIRVDebug::Scope && EI->getExtOp() != SPIRVDebug::NoScope) {
+        EI->getExtOp() != SPIRVDebug::Scope && EI->getExtOp() != SPIRVDebug::NoScope &&
+        EI->getExtOp() != SPIRVDebug::Line && EI->getExtOp() != SPIRVDebug::NoLine &&
+        EI->getExtOp() != SPIRVDebug::FunctionDefinition) {
       DebugInstVec.push_back(EI);
     }
     break;
diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
index 4a01d3ac88..49e92c312e 100644
--- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
+++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
@@ -512,6 +512,8 @@ template <> inline void SPIRVMap<Capability, std::string>::init() {
   add(CapabilityCooperativeMatrixKHR, "CooperativeMatrixKHR");
   add(CapabilityComputeDerivativeGroupLinearNV, "ComputeDerivativeGroupLinearNV");
   add(CapabilityComputeDerivativeGroupQuadsNV, "ComputeDerivativeGroupQuadsNV");
+  add(CapabilityComputeDerivativeGroupLinearKHR, "ComputeDerivativeGroupLinearKHR");
+  add(CapabilityComputeDerivativeGroupQuadsKHR, "ComputeDerivativeGroupQuadsKHR");
   add(CapabilityExpectAssumeKHR, "ExpectAssumeKHR");
   add(CapabilityGroupNonUniformRotateKHR, "GroupNonUniformRotateKHR");
   add(CapabilityQuadControlKHR, "QuadControlKHR");
diff --git a/llpc/unittests/context/testOptLevel.cpp b/llpc/unittests/context/testOptLevel.cpp
index 48acfdd5bd..6b83373b66 100644
--- a/llpc/unittests/context/testOptLevel.cpp
+++ b/llpc/unittests/context/testOptLevel.cpp
@@ -63,7 +63,7 @@ TEST(LlpcContextTests, MatchPipelineOptLevel) {
     GraphicsPipelineBuildInfo pipelineInfo = {};
     pipelineInfo.options.optimizationLevel = static_cast<uint32_t>(optLevel);
 
-    GraphicsContext graphicsContext(GfxIp, &pipelineInfo, &pipelineHash, &cacheHash);
+    GraphicsContext graphicsContext(GfxIp, "Vulkan", &pipelineInfo, &pipelineHash, &cacheHash);
 
     context.attachPipelineContext(&graphicsContext);
 
@@ -97,7 +97,7 @@ TEST(LlpcContextTests, MatchPipelineOptLevel) {
     ComputePipelineBuildInfo pipelineInfo = {};
     pipelineInfo.options.optimizationLevel = static_cast<uint32_t>(optLevel);
 
-    ComputeContext computeContext(GfxIp, &pipelineInfo, &pipelineHash, &cacheHash);
+    ComputeContext computeContext(GfxIp, "Vulkan", &pipelineInfo, &pipelineHash, &cacheHash);
 
     context.attachPipelineContext(&computeContext);
 
diff --git a/llpc/util/llpcShaderModuleHelper.cpp b/llpc/util/llpcShaderModuleHelper.cpp
index c9acab887b..fb0739d120 100644
--- a/llpc/util/llpcShaderModuleHelper.cpp
+++ b/llpc/util/llpcShaderModuleHelper.cpp
@@ -29,10 +29,13 @@
 ***********************************************************************************************************************
 */
 #include "llpcShaderModuleHelper.h"
+#include "SPIRVEntry.h"
+#include "SPIRVFunction.h"
+#include "SPIRVInstruction.h"
+#include "SPIRVModule.h"
 #include "llpcDebug.h"
 #include "llpcError.h"
 #include "llpcUtil.h"
-#include "spirvExt.h"
 #include "vkgcUtil.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
@@ -42,6 +45,7 @@
 using namespace llvm;
 using namespace MetroHash;
 using namespace spv;
+using namespace SPIRV;
 using namespace Util;
 
 using Vkgc::SpirvHeader;
@@ -56,245 +60,256 @@ opt<bool> TrimDebugInfo("trim-debug-info", cl::desc("Trim debug information in S
 } // namespace llvm
 
 namespace Llpc {
+
 // =====================================================================================================================
-// Returns the shader module usage for the given Spir-V binary.
+// Returns the shader module usage for the given SPIR-V module.
 //
-// @param spvBinCode : SPIR-V binary data
+// @param module : SPIR-V module
 // @returns : Shader module usage info
-ShaderModuleUsage ShaderModuleHelper::getShaderModuleUsageInfo(const BinaryData *spvBinCode) {
-  const unsigned *code = reinterpret_cast<const unsigned *>(spvBinCode->pCode);
-  const unsigned *end = code + spvBinCode->codeSize / sizeof(unsigned);
-  const unsigned *codePos = code + sizeof(SpirvHeader) / sizeof(unsigned);
-
+ShaderModuleUsage ShaderModuleHelper::getShaderModuleUsageInfo(SPIRVModule *module) {
+  assert(module);
   ShaderModuleUsage shaderModuleUsage = {};
-  // Parse SPIR-V instructions
-  std::unordered_set<unsigned> capabilities;
-  bool hasIndexDecoration = false;
 
-  while (codePos < end) {
-    unsigned opCode = (codePos[0] & OpCodeMask);
-    unsigned wordCount = (codePos[0] >> WordCountShift);
-    assert(wordCount > 0 && codePos + wordCount <= end && "Invalid SPIR-V binary\n");
-
-    // Parse each instruction and find those we are interested in
-    switch (opCode) {
-    case OpCapability: {
-      assert(wordCount == 2);
-      auto capability = static_cast<Capability>(codePos[1]);
-      capabilities.insert(capability);
+  // Helper to set corresponding usage based on the specified built-in
+  auto processBuiltIn = [&](BuiltIn builtIn, bool structMember) {
+    switch (builtIn) {
+    case BuiltInPointSize:
+      // NOTE: When any member of gl_PerVertex is used, its other members will be added to SPIR-V in the annotation
+      // section. We are unable to determine their actual usage unless we parse the AccessChain instruction.
+      if (!structMember)
+        shaderModuleUsage.usePointSize = true;
       break;
-    }
-    case OpExtInst: {
-      auto extInst = static_cast<GLSLstd450>(codePos[4]);
-      switch (extInst) {
-      case GLSLstd450InterpolateAtSample:
-        shaderModuleUsage.useSampleInfo = true;
-        break;
-      case GLSLstd450NMin:
-      case GLSLstd450NMax:
-        shaderModuleUsage.useIsNan = true;
-        break;
-      default:
-        break;
-      }
+    case BuiltInPrimitiveShadingRateKHR:
+    case BuiltInShadingRateKHR:
+      shaderModuleUsage.useShadingRate = true;
       break;
-    }
-    case OpExtension: {
-      StringRef extName = reinterpret_cast<const char *>(&codePos[1]);
-      if (extName == "SPV_AMD_shader_ballot") {
-        shaderModuleUsage.useSubgroupSize = true;
-      }
+    case BuiltInSamplePosition:
+      shaderModuleUsage.useSampleInfo = true;
       break;
-    }
-    case OpExecutionMode: {
-      auto execMode = static_cast<ExecutionMode>(codePos[2]);
-      switch (execMode) {
-      case ExecutionModeOriginUpperLeft:
-        shaderModuleUsage.originUpperLeft = true;
-        break;
-      case ExecutionModePixelCenterInteger:
-        shaderModuleUsage.pixelCenterInteger = true;
-        break;
-      case ExecutionModeXfb:
-        shaderModuleUsage.enableXfb = true;
-      default: {
-        break;
-      }
-      }
+    case BuiltInFragCoord:
+      shaderModuleUsage.useFragCoord = true;
+      break;
+    case BuiltInViewportIndex:
+    case BuiltInPointCoord:
+    case BuiltInLayer:
+      shaderModuleUsage.useGenericBuiltIn = true;
+      break;
+    case BuiltInClipDistance:
+    case BuiltInCullDistance:
+      // NOTE: When any member of gl_PerVertex is used, its other members will be added to SPIR-V in the annotation
+      // section. We are unable to determine their actual usage unless we parse the AccessChain instruction.
+      if (!structMember)
+        shaderModuleUsage.useGenericBuiltIn = true;
+      break;
+    case BuiltInBaryCoordKHR:
+    case BuiltInBaryCoordNoPerspKHR:
+      shaderModuleUsage.useBarycentric = true;
+      break;
+    case BuiltInLaunchIdKHR:
+      shaderModuleUsage.rtSystemValueUsage.ray.launchId = true;
+      break;
+    case BuiltInLaunchSizeKHR:
+      shaderModuleUsage.rtSystemValueUsage.ray.launchSize = true;
+      break;
+    case BuiltInWorldRayOriginKHR:
+      shaderModuleUsage.rtSystemValueUsage.ray.worldRayOrigin = true;
+      break;
+    case BuiltInWorldRayDirectionKHR:
+      shaderModuleUsage.rtSystemValueUsage.ray.worldRayDirection = true;
+      break;
+    case BuiltInIncomingRayFlagsKHR:
+      shaderModuleUsage.rtSystemValueUsage.ray.flags = true;
+      break;
+    case BuiltInRayTminKHR:
+      shaderModuleUsage.rtSystemValueUsage.ray.tMin = true;
+      break;
+    case BuiltInHitTNV:
+      shaderModuleUsage.rtSystemValueUsage.ray.tCurrent = true;
+      break;
+    case BuiltInObjectRayOriginKHR:
+      shaderModuleUsage.rtSystemValueUsage.primitive.objectRayOrigin = true;
+      break;
+    case BuiltInObjectRayDirectionKHR:
+      shaderModuleUsage.rtSystemValueUsage.primitive.objectRayDirection = true;
+      break;
+    case BuiltInPrimitiveId:
+      shaderModuleUsage.useGenericBuiltIn = true;
+      shaderModuleUsage.rtSystemValueUsage.primitive.primitiveIndex = true;
+      break;
+    case BuiltInInstanceId:
+      shaderModuleUsage.rtSystemValueUsage.primitive.instanceID = true;
+      break;
+    case BuiltInInstanceCustomIndexKHR:
+      shaderModuleUsage.rtSystemValueUsage.primitive.instanceIndex = true;
+      break;
+    case BuiltInObjectToWorldKHR:
+      shaderModuleUsage.rtSystemValueUsage.primitive.objectToWorld = true;
+      break;
+    case BuiltInWorldToObjectKHR:
+      shaderModuleUsage.rtSystemValueUsage.primitive.worldToObject = true;
+      break;
+    case BuiltInHitKindKHR:
+      shaderModuleUsage.rtSystemValueUsage.primitive.hitKind = true;
+      break;
+    case BuiltInHitTriangleVertexPositionsKHR:
+      shaderModuleUsage.rtSystemValueUsage.primitive.hitTrianglePosition = true;
+      break;
+    case BuiltInRayGeometryIndexKHR:
+      shaderModuleUsage.rtSystemValueUsage.primitive.geometryIndex = true;
+      break;
+    default:
       break;
     }
-    case OpDecorate:
-    case OpMemberDecorate: {
-      auto decoration =
-          (opCode == OpDecorate) ? static_cast<Decoration>(codePos[2]) : static_cast<Decoration>(codePos[3]);
-      if (decoration == DecorationInvariant) {
-        shaderModuleUsage.useInvariant = true;
+  };
+
+  // Set usage relevant to constants
+  for (unsigned i = 0; i < module->getNumConstants(); ++i) {
+    auto constant = module->getConstant(i);
+
+    // Built-in decoration could be applied to constant
+    SPIRVWord builtIn = SPIRVWORD_MAX;
+    if (constant->hasDecorate(DecorationBuiltIn, 0, &builtIn))
+      processBuiltIn(static_cast<BuiltIn>(builtIn), false);
+
+    if (constant->getOpCode() == OpSpecConstantTrue || constant->getOpCode() == OpSpecConstantFalse ||
+        constant->getOpCode() == OpSpecConstant || constant->getOpCode() == OpSpecConstantComposite ||
+        constant->getOpCode() == OpSpecConstantOp)
+      shaderModuleUsage.useSpecConstant = true;
+  }
+
+  // Set usage relevant to variables
+  bool hasIndexDecorate = false;
+
+  for (unsigned i = 0; i < module->getNumVariables(); ++i) {
+    auto variable = module->getVariable(i);
+    if (variable->hasDecorate(DecorationIndex))
+      hasIndexDecorate = true;
+
+    if (variable->hasDecorate(DecorationInvariant))
+      shaderModuleUsage.useInvariant = true;
+
+    // Built-in decoration applied to variable
+    SPIRVWord builtIn = SPIRVWORD_MAX;
+    if (variable->hasDecorate(DecorationBuiltIn, 0, &builtIn))
+      processBuiltIn(static_cast<BuiltIn>(builtIn), false);
+
+    auto variableType = variable->getType()->getPointerElementType(); // Dereference to variable value type
+    if (variableType && variableType->isTypeStruct()) {
+      // Struct type, built-in decoration could be applied to struct member
+      for (unsigned j = 0; j < variableType->getStructMemberCount(); ++j) {
+        if (variableType->hasMemberDecorate(j, DecorationInvariant))
+          shaderModuleUsage.useInvariant = true;
+
+        builtIn = SPIRVWORD_MAX;
+        if (variableType->hasMemberDecorate(j, DecorationBuiltIn, 0, &builtIn))
+          processBuiltIn(static_cast<BuiltIn>(builtIn), true);
       }
-      if (decoration == DecorationBuiltIn) {
-        auto builtIn = (opCode == OpDecorate) ? static_cast<BuiltIn>(codePos[3]) : static_cast<BuiltIn>(codePos[4]);
-        switch (builtIn) {
-        case BuiltInPointSize: {
-          shaderModuleUsage.usePointSize = true;
-          break;
-        }
-        case BuiltInPrimitiveShadingRateKHR:
-        case BuiltInShadingRateKHR: {
-          shaderModuleUsage.useShadingRate = true;
-          break;
-        }
-        case BuiltInSamplePosition: {
-          shaderModuleUsage.useSampleInfo = true;
-          break;
-        }
-        case BuiltInFragCoord: {
-          shaderModuleUsage.useFragCoord = true;
-          break;
-        }
-        case BuiltInViewportIndex:
-        case BuiltInPointCoord:
-        case BuiltInLayer:
-        case BuiltInClipDistance:
-        case BuiltInCullDistance: {
-          shaderModuleUsage.useGenericBuiltIn = true;
-          break;
-        }
-        case BuiltInBaryCoordKHR:
-        case BuiltInBaryCoordNoPerspKHR: {
-          shaderModuleUsage.useBarycentric = true;
-          break;
-        }
-        case BuiltInPrimitiveId: {
-          shaderModuleUsage.useGenericBuiltIn = true;
-          shaderModuleUsage.rtSystemValueUsage.primitive.primitiveIndex = 1;
-          break;
-        }
-        case BuiltInInstanceId: {
-          shaderModuleUsage.rtSystemValueUsage.primitive.instanceID = 1;
-          break;
-        }
-        case BuiltInLaunchIdKHR: {
-          shaderModuleUsage.rtSystemValueUsage.ray.launchId = 1;
-          break;
-        }
-        case BuiltInLaunchSizeKHR: {
-          shaderModuleUsage.rtSystemValueUsage.ray.launchSize = 1;
-          break;
-        }
-        case BuiltInWorldRayOriginKHR: {
-          shaderModuleUsage.rtSystemValueUsage.ray.worldRayOrigin = 1;
-          break;
-        }
-        case BuiltInWorldRayDirectionKHR: {
-          shaderModuleUsage.rtSystemValueUsage.ray.worldRayDirection = 1;
-          break;
-        }
-        case BuiltInObjectRayOriginKHR: {
-          shaderModuleUsage.rtSystemValueUsage.primitive.objectRayOrigin = 1;
-          break;
-        }
-        case BuiltInObjectRayDirectionKHR: {
-          shaderModuleUsage.rtSystemValueUsage.primitive.objectRayDirection = 1;
-          break;
-        }
-        case BuiltInRayTminKHR: {
-          shaderModuleUsage.rtSystemValueUsage.ray.tMin = 1;
-          break;
-        }
-        case BuiltInInstanceCustomIndexKHR: {
-          shaderModuleUsage.rtSystemValueUsage.primitive.instanceIndex = 1;
-          break;
-        }
-        case BuiltInObjectToWorldKHR: {
-          shaderModuleUsage.rtSystemValueUsage.primitive.objectToWorld = 1;
-          break;
-        }
-        case BuiltInWorldToObjectKHR: {
-          shaderModuleUsage.rtSystemValueUsage.primitive.worldToObject = 1;
+    }
+  }
+
+  if (!hasIndexDecorate)
+    shaderModuleUsage.disableDualSource = true;
+
+  // Set usage relevant to instructions
+  for (unsigned i = 0; i < module->getNumFunctions(); ++i) {
+    auto func = module->getFunction(i);
+    for (unsigned j = 0; j < func->getNumBasicBlock(); ++j) {
+      auto block = func->getBasicBlock(j);
+      for (unsigned k = 0; k < block->getNumInst(); ++k) {
+        auto inst = block->getInst(k);
+        switch (inst->getOpCode()) {
+        case OpExtInst: {
+          auto extInst = static_cast<SPIRVExtInst *>(inst);
+          if (extInst->getExtOp() == GLSLstd450InterpolateAtSample)
+            shaderModuleUsage.useSampleInfo = true;
+          else if (extInst->getExtOp() == GLSLstd450NMin || extInst->getExtOp() == GLSLstd450NMax)
+            shaderModuleUsage.useIsNan = true;
           break;
         }
-        case BuiltInHitTNV: {
-          shaderModuleUsage.rtSystemValueUsage.ray.tCurrent = 1;
+        case OpTraceNV:
+        case OpTraceRayKHR:
+          shaderModuleUsage.hasTraceRay = true;
           break;
-        }
-        case BuiltInHitKindKHR: {
-          shaderModuleUsage.rtSystemValueUsage.primitive.hitKind = 1;
+        case OpExecuteCallableNV:
+        case OpExecuteCallableKHR:
+          shaderModuleUsage.hasExecuteCallable = true;
           break;
-        }
-        case BuiltInHitTriangleVertexPositionsKHR: {
-          shaderModuleUsage.rtSystemValueUsage.primitive.hitTrianglePosition = 1;
+        case OpIsNan:
+          shaderModuleUsage.useIsNan = true;
           break;
-        }
-        case BuiltInIncomingRayFlagsKHR: {
-          shaderModuleUsage.rtSystemValueUsage.ray.flags = 1;
+        case OpAccessChain: {
+          auto accessChain = static_cast<SPIRVAccessChain *>(inst);
+          auto base = accessChain->getBase();
+          auto baseType = base->getType()->getPointerElementType(); // Dereference to base value type
+          const auto indices = accessChain->getIndices();
+          // NOTE: When any member of gl_PerVertex is used, its other members are also declared in the annotation
+          // section, so PointSize/ClipDistance/CullDistance usage is only known from the access chain itself. A chain
+          // with no indices or a non-constant first index cannot be resolved to a struct member and is skipped.
+          if (base->getType()->getPointerStorageClass() == StorageClassOutput && baseType && baseType->isTypeStruct() &&
+              !indices.empty() && indices[0]->getOpCode() == OpConstant) {
+            const auto index = static_cast<SPIRVConstant *>(indices[0])->getZExtIntValue();
+            SPIRVWord builtIn = SPIRVWORD_MAX;
+            if (baseType->hasMemberDecorate(index, DecorationBuiltIn, 0, &builtIn)) {
+              switch (builtIn) {
+              case BuiltInPointSize:
+                shaderModuleUsage.usePointSize = true;
+                break;
+              case BuiltInClipDistance:
+              case BuiltInCullDistance:
+                shaderModuleUsage.useGenericBuiltIn = true;
+                break;
+              default:
+                break;
+              }
+            }
+          }
           break;
         }
-        case BuiltInRayGeometryIndexKHR: {
-          shaderModuleUsage.rtSystemValueUsage.primitive.geometryIndex = 1;
+        default:
           break;
         }
-        default: {
-          break;
-        }
-        }
-      } else if (decoration == DecorationIndex) {
-        hasIndexDecoration = true;
-      } else if (decoration == DecorationPerVertexKHR)
-        shaderModuleUsage.useBarycentric = true;
-      break;
-    }
-    case OpSpecConstantTrue:
-    case OpSpecConstantFalse:
-    case OpSpecConstant:
-    case OpSpecConstantComposite:
-    case OpSpecConstantOp: {
-      shaderModuleUsage.useSpecConstant = true;
-      break;
-    }
-    case OpTraceNV:
-    case OpTraceRayKHR: {
-      shaderModuleUsage.hasTraceRay = true;
-      break;
-    }
-    case OpExecuteCallableNV:
-    case OpExecuteCallableKHR:
-      shaderModuleUsage.hasExecuteCallable = true;
-      break;
-    case OpIsNan: {
-      shaderModuleUsage.useIsNan = true;
-      break;
-    }
-    default: {
-      break;
-    }
+      }
     }
-    codePos += wordCount;
   }
 
-  // Without any DecorationIndex, it needs to disableDualSource
-  if (hasIndexDecoration == false)
-    shaderModuleUsage.disableDualSource = true;
+  // Set usage relevant to execution modes
+  for (unsigned i = 0; i < module->getNumFunctions(); ++i) {
+    auto func = module->getFunction(i);
+    if (module->getEntryPoint(func->getId())) {
+      if (func->getExecutionMode(ExecutionModeOriginUpperLeft))
+        shaderModuleUsage.originUpperLeft = true;
+
+      if (func->getExecutionMode(ExecutionModePixelCenterInteger))
+        shaderModuleUsage.pixelCenterInteger = true;
+
+      if (func->getExecutionMode(ExecutionModeXfb))
+        shaderModuleUsage.enableXfb = true;
+    }
+  }
 
-  if (capabilities.find(CapabilityVariablePointersStorageBuffer) != capabilities.end())
+  // Set usage relevant to capabilities
+  if (module->hasCapability(CapabilityVariablePointersStorageBuffer))
     shaderModuleUsage.enableVarPtrStorageBuf = true;
 
-  if (capabilities.find(CapabilityVariablePointers) != capabilities.end())
+  if (module->hasCapability(CapabilityVariablePointers))
     shaderModuleUsage.enableVarPtr = true;
 
-  if (capabilities.find(CapabilityRayQueryKHR) != capabilities.end())
+  if (module->hasCapability(CapabilityRayQueryKHR))
     shaderModuleUsage.enableRayQuery = true;
 
-  if ((!shaderModuleUsage.useSubgroupSize) &&
-          ((capabilities.count(CapabilityGroupNonUniform) > 0) ||
-           (capabilities.count(CapabilityGroupNonUniformVote) > 0) ||
-           (capabilities.count(CapabilityGroupNonUniformArithmetic) > 0) ||
-           (capabilities.count(CapabilityGroupNonUniformBallot) > 0) ||
-           (capabilities.count(CapabilityGroupNonUniformShuffle) > 0) ||
-           (capabilities.count(CapabilityGroupNonUniformShuffleRelative) > 0) ||
-           (capabilities.count(CapabilityGroupNonUniformClustered) > 0) ||
-           (capabilities.count(CapabilityGroupNonUniformQuad) > 0) ||
-           (capabilities.count(CapabilitySubgroupBallotKHR) > 0) ||
-           (capabilities.count(CapabilitySubgroupVoteKHR) > 0) || (capabilities.count(CapabilityGroups) > 0)) ||
-      (capabilities.count(CapabilityGroupNonUniformRotateKHR) > 0)) {
+  if (module->getExtension().count("SPV_AMD_shader_ballot") > 0)
+    shaderModuleUsage.useSubgroupSize = true;
+
+  if (!shaderModuleUsage.useSubgroupSize &&
+      (module->hasCapability(CapabilityGroupNonUniform) || module->hasCapability(CapabilityGroupNonUniformVote) ||
+       module->hasCapability(CapabilityGroupNonUniformArithmetic) ||
+       module->hasCapability(CapabilityGroupNonUniformBallot) ||
+       module->hasCapability(CapabilityGroupNonUniformShuffle) ||
+       module->hasCapability(CapabilityGroupNonUniformShuffleRelative) ||
+       module->hasCapability(CapabilityGroupNonUniformClustered) ||
+       module->hasCapability(CapabilityGroupNonUniformQuad) || module->hasCapability(CapabilitySubgroupBallotKHR) ||
+       module->hasCapability(CapabilitySubgroupVoteKHR) || module->hasCapability(CapabilityGroups) ||
+       module->hasCapability(CapabilityGroupNonUniformRotateKHR))) {
     shaderModuleUsage.useSubgroupSize = true;
   }
 
@@ -565,19 +580,15 @@ Result ShaderModuleHelper::getShaderBinaryType(BinaryData shaderBinary, BinaryTy
 // will point to the data in trimmed code.  It should not be resized or deallocated while moduleData is still needed.
 //
 // @param shaderInfo : Shader module build info
+// @param module : SPIR-V module (used only when moduleData.binType, which the caller must set, is SPIR-V)
 // @param codeBuffer [out] : A buffer to hold the trimmed code if it is needed.
 // @param moduleData [out] : If successful, the module data for the module.  Undefined if unsuccessful.
 // @return : Success if the data was read.  The appropriate error otherwise.
-Result ShaderModuleHelper::getModuleData(const ShaderModuleBuildInfo *shaderInfo,
+Result ShaderModuleHelper::getModuleData(const ShaderModuleBuildInfo *shaderInfo, SPIRVModule *module,
                                          llvm::MutableArrayRef<unsigned> codeBuffer,
                                          Vkgc::ShaderModuleData &moduleData) {
-  const BinaryData &shaderBinary = shaderInfo->shaderBin;
-  Result result = ShaderModuleHelper::getShaderBinaryType(shaderBinary, moduleData.binType);
-  if (result != Result::Success)
-    return result;
-
   if (moduleData.binType == BinaryType::Spirv) {
-    moduleData.usage = ShaderModuleHelper::getShaderModuleUsageInfo(&shaderBinary);
+    moduleData.usage = ShaderModuleHelper::getShaderModuleUsageInfo(module);
     moduleData.usage.isInternalRtShader = shaderInfo->options.pipelineOptions.internalRtShaders;
     auto codeOrErr = getShaderCode(shaderInfo, codeBuffer);
     if (Error err = codeOrErr.takeError())
@@ -593,8 +604,8 @@ Result ShaderModuleHelper::getModuleData(const ShaderModuleBuildInfo *shaderInfo
                   "Expecting the cacheHash entry in the module data to be the same size as the MetroHash hash!");
     memcpy(moduleData.cacheHash, cacheHash.dwords, sizeof(cacheHash));
   } else {
-    moduleData.binCode = shaderBinary;
-    memcpy(codeBuffer.data(), shaderBinary.pCode, shaderBinary.codeSize);
+    moduleData.binCode = shaderInfo->shaderBin;
+    memcpy(codeBuffer.data(), shaderInfo->shaderBin.pCode, shaderInfo->shaderBin.codeSize);
   }
 
   return Result::Success;
@@ -627,9 +638,11 @@ Expected<BinaryData> ShaderModuleHelper::getShaderCode(const ShaderModuleBuildIn
 }
 
 // =====================================================================================================================
+// Get shader code size. If SPIR-V binary is trimmed, get the new size.
+//
 // @param shaderInfo : Shader module build info
 // @return : The number of bytes need to hold the code for this shader module.
-Expected<unsigned> ShaderModuleHelper::getCodeSize(const ShaderModuleBuildInfo *shaderInfo) {
+Expected<unsigned> ShaderModuleHelper::getShaderCodeSize(const ShaderModuleBuildInfo *shaderInfo) {
   const BinaryData &shaderBinary = shaderInfo->shaderBin;
   BinaryType binaryType;
   Result result = ShaderModuleHelper::getShaderBinaryType(shaderBinary, binaryType);
@@ -638,9 +651,9 @@ Expected<unsigned> ShaderModuleHelper::getCodeSize(const ShaderModuleBuildInfo *
 
   bool trimDebugInfo =
       binaryType != BinaryType::LlvmBc && cl::TrimDebugInfo && !(shaderInfo->options.pipelineOptions.internalRtShaders);
-
   if (!trimDebugInfo)
     return shaderBinary.codeSize;
+
   return ShaderModuleHelper::trimSpirvDebugInfo(&shaderBinary, {});
 }
 
diff --git a/llpc/util/llpcShaderModuleHelper.h b/llpc/util/llpcShaderModuleHelper.h
index 5282f38fd1..f5bb844636 100644
--- a/llpc/util/llpcShaderModuleHelper.h
+++ b/llpc/util/llpcShaderModuleHelper.h
@@ -35,6 +35,12 @@
 #include <llvm/Support/Error.h>
 #include <vector>
 
+namespace SPIRV {
+
+class SPIRVModule;
+
+} // namespace SPIRV
+
 namespace Llpc {
 
 // Represents the information of one shader entry in ShaderModuleData
@@ -55,7 +61,7 @@ struct ShaderEntryName {
 // Represents LLPC shader module helper class
 class ShaderModuleHelper {
 public:
-  static ShaderModuleUsage getShaderModuleUsageInfo(const BinaryData *spvBinCode);
+  static ShaderModuleUsage getShaderModuleUsageInfo(SPIRV::SPIRVModule *module);
 
   static llvm::Expected<unsigned> trimSpirvDebugInfo(const BinaryData *spvBin,
                                                      llvm::MutableArrayRef<unsigned> codeBuffer);
@@ -70,9 +76,9 @@ class ShaderModuleHelper {
 
   static bool isLlvmBitcode(const BinaryData *shaderBin);
   static Result getShaderBinaryType(BinaryData shaderBinary, BinaryType &binaryType);
-  static Result getModuleData(const ShaderModuleBuildInfo *shaderInfo, llvm::MutableArrayRef<unsigned> codeBuffer,
-                              Vkgc::ShaderModuleData &moduleData);
-  static llvm::Expected<unsigned> getCodeSize(const ShaderModuleBuildInfo *shaderInfo);
+  static Result getModuleData(const ShaderModuleBuildInfo *shaderInfo, SPIRV::SPIRVModule *module,
+                              llvm::MutableArrayRef<unsigned> codeBuffer, Vkgc::ShaderModuleData &moduleData);
+  static llvm::Expected<unsigned> getShaderCodeSize(const ShaderModuleBuildInfo *shaderInfo);
   static llvm::Expected<BinaryData> getShaderCode(const ShaderModuleBuildInfo *shaderInfo,
                                                   llvm::MutableArrayRef<unsigned int> &codeBuffer);
 };
diff --git a/llvmraytracing/CMakeLists.txt b/llvmraytracing/CMakeLists.txt
index bc041f02bf..ecec8ad005 100644
--- a/llvmraytracing/CMakeLists.txt
+++ b/llvmraytracing/CMakeLists.txt
@@ -24,7 +24,6 @@ add_llvm_library(LLVMRaytracing
   lib/DXILSupport.cpp
   lib/GpurtContext.cpp
   lib/GpurtDialect.cpp
-  lib/LegacyCleanupContinuations.cpp
   lib/ContinuationsStatsReport.cpp
   lib/LgcCpsDialect.cpp
   lib/LgcCpsJumpInliner.cpp
@@ -38,6 +37,7 @@ add_llvm_library(LLVMRaytracing
   lib/PipelineState.cpp
   lib/PayloadAccessQualifiers.cpp
   lib/RemoveTypesMetadata.cpp
+  lib/SpecializeDriverShaders.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/llvmraytracing/include/lgc/GpurtDialect.td b/llvmraytracing/include/lgc/GpurtDialect.td
index a632dd16dc..b4a965d9be 100644
--- a/llvmraytracing/include/lgc/GpurtDialect.td
+++ b/llvmraytracing/include/lgc/GpurtDialect.td
@@ -366,12 +366,6 @@ def GpurtInitStaticIdOp : GpurtOp<"init.static.id", [Memory<[]>, WillReturn]> {
   let summary = "Initialize (generate) a ray static ID";
 }
 
-def GpurtContinuationStackIsGlobalOp : GpurtOp<"continuation.stack.is.global", [Memory<[]>, WillReturn]> {
-  let arguments = (ins);
-  let results = (outs I1:$result);
-  let summary = "Check whether continuation stack is global";
-}
-
 def GpurtGetRayQueryDispatchIdOp : GpurtOp<"get.ray.query.dispatch.id", [Memory<[(read InaccessibleMem)]>, WillReturn]> {
   let arguments = (ins);
   let results = (outs V3I32:$dispatchId);
diff --git a/llvmraytracing/include/lgc/LgcCpsDialect.td b/llvmraytracing/include/lgc/LgcCpsDialect.td
index e7d7654e29..c671a0700c 100644
--- a/llvmraytracing/include/lgc/LgcCpsDialect.td
+++ b/llvmraytracing/include/lgc/LgcCpsDialect.td
@@ -43,7 +43,7 @@ def ContinuationReference : TgConstant<(or I32, I64)>, Type;
 
 // =====================================================================================================================
 def JumpOp : LgcCpsOp<"jump", [NoReturn]> {
-    let arguments = (ins ContinuationReference:$target, AttrI32:$levels, value:$state, ContinuationReference:$rcr, varargs:$tail);
+    let arguments = (ins ContinuationReference:$target, AttrI32:$levels, value:$state, I32:$csp, ContinuationReference:$rcr, varargs:$tail);
     let results = (outs);
 
     let summary = "Jump to a CPS function.";
@@ -52,6 +52,7 @@ def JumpOp : LgcCpsOp<"jump", [NoReturn]> {
             * target, the continuation reference
             * levels, a bitmask of levels in which target may run
             * state, which is pushed to the continuation stack before jumping,
+            * csp, continuation stack pointer,
             * rcr, a continuation reference the called function can potentially return to
             * an arbitrary set of arguments appended to the tail of the argument list.
     }];
diff --git a/llvmraytracing/include/llvmraytracing/Continuations.h b/llvmraytracing/include/llvmraytracing/Continuations.h
index 3b94ad0c90..9563b9d154 100644
--- a/llvmraytracing/include/llvmraytracing/Continuations.h
+++ b/llvmraytracing/include/llvmraytracing/Continuations.h
@@ -112,9 +112,6 @@ Function *getSetLocalRootIndex(Module &M);
 /// Get intrinsic to convert a dx handle to an acceleration struct address.
 Function *getAccelStructAddr(Module &M, Type *HandleTy);
 
-/// Get the await intrinsic.
-Function *getContinuationAwait(Module &M, Type *TokenTy, StructType *RetTy);
-
 /// Get function that returns the global memory base address if the continuation
 /// stack lives in global memory.
 Function *getContinuationStackGlobalMemBase(Module &M);
@@ -149,7 +146,7 @@ void terminateShader(IRBuilder<> &Builder, CallInst *CompleteCall);
 ///
 /// Replace intrinsics called by gpurt code that can be replaced early.
 /// Returns whether something changed.
-bool earlyDriverTransform(Module &M);
+bool earlyGpurtTransform(Module &M);
 
 /// Given a number NumI32s of 4-byte values and the number of reserved
 /// registers, return the amount of dynamic storage required to store that many
@@ -161,14 +158,6 @@ uint64_t computePayloadSpillSize(uint64_t NumI32s, uint64_t NumReservedRegisters
 // of individual bytes at the end if NumBytes is not a multiple of 4.
 void copyBytes(IRBuilder<> &B, Value *Dst, Value *Src, uint64_t NumBytes);
 
-class LegacyCleanupContinuationsPass : public llvm::PassInfoMixin<LegacyCleanupContinuationsPass> {
-public:
-  LegacyCleanupContinuationsPass() {}
-  llvm::PreservedAnalyses run(llvm::Module &Module, llvm::ModuleAnalysisManager &AnalysisManager);
-
-  static llvm::StringRef name() { return "legacy continuation cleanup"; }
-};
-
 class CleanupContinuationsPass : public llvm::PassInfoMixin<CleanupContinuationsPass> {
 public:
   CleanupContinuationsPass(bool Use64BitContinuationReferences = false)
diff --git a/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h b/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h
index f134257833..e40951662a 100644
--- a/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h
+++ b/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h
@@ -116,7 +116,7 @@ enum class AnyHitExitKind {
 };
 
 // The address space used for the continuation stack.
-enum class ContStackAddrspace : uint32_t { Scratch = 21, Global = 22 };
+enum class ContStackAddrspace : uint32_t { GlobalLLPC = 1, ScratchLLPC = 5, Scratch = 21, Global = 22 };
 
 struct ContSetting {
   /// A hash value that is used as name.
@@ -205,9 +205,6 @@ class ContHelper {
   // The raytracing ip level that is available on the target architecture.
   // This is exposed to gpurt code via the GetRtip intrinsic.
   static constexpr const char *MDRtipName = "continuation.rtip";
-  // Flags set for continuations.
-  // This is exposed to gpurt code via the ContinuationsGetFlags intrinsic.
-  static constexpr const char *MDFlagsName = "continuation.flags";
 
   static std::optional<uint32_t> extractZExtI32Constant(MDNode *Node) {
     if (Node) {
@@ -366,7 +363,6 @@ class ContHelper {
   MODULE_METADATA_HELPER(MaxUsedPayloadRegisterCount, MDMaxUsedPayloadRegisterCountName)
   MODULE_METADATA_HELPER(MaxPayloadRegisterCount, MDMaxPayloadRegisterCountName)
   MODULE_METADATA_HELPER(Rtip, MDRtipName)
-  MODULE_METADATA_HELPER(Flags, MDFlagsName)
 
 #undef MODULE_METADATA_HELPER
 
@@ -380,7 +376,9 @@ class ContHelper {
     if (!AddrSpace)
       return {};
     assert((*AddrSpace == static_cast<uint32_t>(ContStackAddrspace::Scratch) ||
-            *AddrSpace == static_cast<uint32_t>(ContStackAddrspace::Global)) &&
+            *AddrSpace == static_cast<uint32_t>(ContStackAddrspace::Global) ||
+            *AddrSpace == static_cast<uint32_t>(ContStackAddrspace::ScratchLLPC) ||
+            *AddrSpace == static_cast<uint32_t>(ContStackAddrspace::GlobalLLPC)) &&
            "Unexpected continuation stack address space");
     return static_cast<ContStackAddrspace>(*AddrSpace);
   };
@@ -413,16 +411,10 @@ class ContHelper {
     return MDNode::get(T->getContext(), {ConstantAsMetadata::get(PoisonValue::get(T))});
   }
 
-  static std::optional<int32_t> tryGetWaitMask(const CallInst &CI) {
-    return extractZExtI32Constant(CI.getMetadata(MDWaitMaskName));
-  }
-
-  static void setWaitMask(CallInst &CI, int32_t WaitMask) {
-    CI.setMetadata(MDWaitMaskName, getI32MDConstant(CI.getContext(), WaitMask));
-  }
+  static void setWaitMask(CallInst &CI) { CI.setMetadata(MDWaitMaskName, MDTuple::get(CI.getContext(), {})); }
 
   // Queries whether an awaited call should wait on a wait mask.
-  static bool isWaitAwaitCall(const CallInst &CI) { return CI.getMetadata(MDWaitMaskName) != nullptr; }
+  static bool isWaitAwaitCall(const CallInst &CI) { return CI.hasMetadata(MDWaitMaskName); }
 
   static void removeWaitMask(CallInst &CI) { CI.setMetadata(MDWaitMaskName, nullptr); }
 
@@ -557,8 +549,6 @@ DRIVER_FUNC_NAME(ShaderStart)
 // Removes the original call.
 void replaceCallsToFunction(llvm::Function &F, llvm::Value &Replacement);
 
-bool isLgcRtOp(const llvm::Function *F);
-
 // Move all basic blocks of OldFunc to NewFunc.
 void moveFunctionBody(Function &OldFunc, Function &NewFunc);
 
diff --git a/llvmraytracing/include/llvmraytracing/PipelineState.h b/llvmraytracing/include/llvmraytracing/PipelineState.h
index f00f335ac3..ba0a2fc570 100644
--- a/llvmraytracing/include/llvmraytracing/PipelineState.h
+++ b/llvmraytracing/include/llvmraytracing/PipelineState.h
@@ -52,6 +52,7 @@
  */
 #pragma once
 
+#include "llvmraytracing/SpecializeDriverShaders.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Error.h"
 
@@ -88,6 +89,7 @@ class PipelineState {
   // The maximum occurring number of payload registers in the pipeline, which will be taken into account for Traversal
   // module so that it sees the correct maximum payload size of a pipeline.
   unsigned MaxUsedPayloadRegisterCount = 0;
+  llvm::SpecializeDriverShadersState SDSState;
 };
 
 } // namespace llvmraytracing
diff --git a/llvmraytracing/include/llvmraytracing/SpecializeDriverShaders.h b/llvmraytracing/include/llvmraytracing/SpecializeDriverShaders.h
new file mode 100644
index 0000000000..04a62ba405
--- /dev/null
+++ b/llvmraytracing/include/llvmraytracing/SpecializeDriverShaders.h
@@ -0,0 +1,177 @@
+/*
+ ***********************************************************************************************************************
+ *
+ *  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal in the Software without restriction, including without limitation the
+ *  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ *  sell copies of the Software, and to permit persons to whom the Software is
+ *  furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice shall be included in all
+ *  copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ *
+ **********************************************************************************************************************/
+
+//===- SpecializeDriverShaders.h ----------------------------------------------------------------------------------===//
+//
+// This file declares a pass to specialize arguments of driver functions (e.g. Traversal) for known constants using
+// full pipeline knowledge.
+//
+//===--------------------------------------------------------------------------------------------------------------===//
+
+// This pass specializes driver shaders (e.g. the Traversal shader), propagating common known arguments into them.
+// For now this only specializes the Traversal shader, but we could later extend it, e.g. for a dedicated Sort shader.
+//
+// For Traversal, we partition CPS functions into in-Traversal and out-of-Traversal functions.
+// In-Traversal functions are Traversal itself, AHS and Intersection functions, including Intersection resume functions.
+//
+// We analyze all jumps to functions that might be in-Traversal, decompose passed arguments into dword-sized argument
+// slots, and determine which argument slots are constant.
+// For in-Traversal functions, we additionally analyze which argument slots are not constant, but preserved.
+// We combine this information to prove that certain argument slots always have a specific constant value when entering
+// the Traversal shader, and specialize Traversal accordingly.
+//
+// Although this optimization requires full-pipeline knowledge, it can also be applied for non-standalone pipelines,
+// because we compile driver functions per pipeline after having processed all contained app shaders.
+//
+// This mostly aims at optimizing the common pattern of using the payload to pass information from CHS/Miss back to
+// RayGen, and leaving the payload uninitialized or zero-initialized during Traversal. However, it also covers
+// some common cases of constant TraceRay arguments, e.g. tMin and tMax.
+//
+// The analysis and specialization parts are done by the same pass. We use metadata to store analysis results on app
+// shaders, and rely on the pipeline compiler to merge the analysis results across modules accordingly.
+// The necessary state is maintained by SpecializeDriverShadersState, which is part of llvmraytracing::PipelineState.
+//
+// As the analysis and optimization relies on specializing constant argument slots, and as we rely on type punning
+// to e.g. pass compatible prefixes of structs, we have to make some assumptions on the calling convention in order
+// to determine which values end up in which argument slots.
+//
+// For that, we assume that aggregate types and vector types are recursively decomposed into scalars, and that the
+// scalars are passed in consecutive argument slots without any padding, covering multiple arg slots for large scalars.
+// We assume that there is no packing of separate small scalars (e.g. 16-bit) into single registers / argument slots.
+// This is the same assumption that is also used in LowerRaytracingPipeline when determining argument padding.
+//
+// We can only analyze argument slots that correspond to a full, aligned dword in the in-memory representation of a
+// type, because our value analysis works on dword slices on the in-memory representation.
+// Other argument slots are conservatively treated as unknown / dynamic.
+// For instance, this excludes i16 scalars, and misaligned i32 scalars (e.g. as part of a packed struct).
+// As of this writing, we don't use such arguments.
+//
+// All of this even works if the data layout (DL) requires padding in passed types, where there is no longer a 1:1
+// correspondence between the dwords in the in-memory layout of args, and the in-register representation.
+// This is achieved by maintaining a mapping between the in-memory representation of a type, which is the basis
+// for our value origin analysis, and the in-register representation.
+// For instance, if i64 is 64-bit aligned, then the type {i32, i64} has a single padding dword in memory, but not
+// when passed as an in-register argument.
+// A shader that receives such a type, and passes the contained i32 and i64 values as separate arguments to the next
+// one is considered to preserve these three argument slots.
+//
+// We rely on being able to replace undef and poison values by arbitrary constants. For instance, if all TraceRay
+// call sites pass in an undef value in a particular argument slot, and the only other shader that does not preserve
+// this argument slot instead passes a constant C, then we assume this argument slot to always equal C.
+// This may break apps that incorrectly rely on implicit zero-initialization.
+// If this becomes an issue, we can make undef/poison behavior configurable, and e.g. treat it as constant zero instead.
+//
+//===--------------------------------------------------------------------------------------------------------------===//
+
+#pragma once
+
+#include "llvm/IR/PassManager.h"
+#include <memory>
+
+namespace llvm {
+
+namespace msgpack {
+class DocNode;
+} // namespace msgpack
+
+// Options for SpecializeDriverShadersPass.
+// Defined out of class to work around issue with a default-initialized argument.
+struct SpecializeDriverShadersOptions {
+  // If set, only analysis is done, but not function specialization.
+  // Skipping the pass can be potentially unsafe unless one can guarantee to skip it on
+  // all modules of a pipeline, including parent pipelines. Otherwise, running the pass
+  // on some but not all modules might lead to incorrect specializations.
+  bool DisableSpecialization = false;
+  // Disable analysis of functions in a module. Allows testing specializations of multiple functions in the same module.
+  bool DisableAnalysis = false;
+
+  // Field-wise equality, spelled out so that this header does not rely on a transitive <tuple> include.
+  bool operator==(SpecializeDriverShadersOptions const &Other) const {
+    return DisableSpecialization == Other.DisableSpecialization && DisableAnalysis == Other.DisableAnalysis;
+  }
+
+  bool operator!=(SpecializeDriverShadersOptions const &Other) const { return !(*this == Other); }
+
+  void exportModuleMetadata(llvm::Module &M) const;
+  static llvm::Expected<SpecializeDriverShadersOptions> fromModuleMetadata(const llvm::Module &M);
+};
+
+class SpecializeDriverShadersPass : public llvm::PassInfoMixin<SpecializeDriverShadersPass> {
+public:
+  llvm::PreservedAnalyses run(llvm::Module &Module, llvm::ModuleAnalysisManager &AnalysisManager);
+  static llvm::StringRef name() { return "Specialize driver functions"; }
+};
+
+// The whole-pipeline state needed by SpecializeDriverShadersPass to optimize driver functions.
+// This implements the interface required by llvmraytracing::PipelineState for serialization.
+// Serialization order on app modules is:
+//  1. Start with app module without metadata
+//  2. Pass runs on module, tries to import from MD, there is none, so start with trivial state
+//  3. At the end of the pass, serialize to MD
+//  4. At the end of the llvmraytracing pipeline, llvmraytracing::PipelineState deserializes from MD
+//  5. The pipeline compiler may merge with the deserialized state from other modules
+//  6. The combined state is serialized to the GpuRt module
+//  7. The pass runs on the GpuRt module, deserializes the combined pipeline state, and specializes
+//     shaders according to that state.
+//
+// In case of separately compiled libraries or parent pipelines, at the end the combined
+// state is serialized to MsgPack, stored as a blob, imported from MsgPack for the child pipeline,
+// and combined with the child pipeline's app shader states.
+//
+// We use the pImpl (pointer to implementation) pattern to prevent exposing implementation details in the header.
+class SpecializeDriverShadersState {
+public:
+  using Self = SpecializeDriverShadersState;
+  SpecializeDriverShadersState();
+  SpecializeDriverShadersState(const Self &Other);
+  SpecializeDriverShadersState(Self &&);
+  // The destructor is declared here and defined out of line, because Impl is only forward declared in this header.
+  ~SpecializeDriverShadersState() noexcept;
+
+  SpecializeDriverShadersState &operator=(const SpecializeDriverShadersState &Other);
+  SpecializeDriverShadersState &operator=(SpecializeDriverShadersState &&Other);
+
+  static llvm::Expected<Self> decodeMsgpack(llvm::msgpack::DocNode &Node);
+  void encodeMsgpack(llvm::msgpack::DocNode &Node) const;
+
+  // In case no module metadata is found, e.g. because the SpecializeDriverShadersPass did not run
+  // on the module, we return a valid, trivial state object.
+  // Errors are only returned in case there is metadata, but it uses an unexpected format.
+  // We only apply the Traversal specialization in case there is an existing nontrivial state,
+  // to prevent miscompiles in case the cross-module state merging is not performed.
+  static llvm::Expected<Self> fromModuleMetadata(const llvm::Module &M);
+  void exportModuleMetadata(llvm::Module &M) const;
+
+  void merge(const Self &Other);
+
+private:
+  friend class SpecializeDriverShadersPass;
+
+  struct Impl;
+  SpecializeDriverShadersState(std::unique_ptr<Impl>);
+  std::unique_ptr<Impl> Pimpl;
+};
+
+} // namespace llvm
diff --git a/llvmraytracing/lib/CleanupContinuations.cpp b/llvmraytracing/lib/CleanupContinuations.cpp
index 0a6639b278..73c3691001 100644
--- a/llvmraytracing/lib/CleanupContinuations.cpp
+++ b/llvmraytracing/lib/CleanupContinuations.cpp
@@ -62,10 +62,12 @@
 #include "llvmraytracing/GpurtContext.h"
 #include "lgc/LgcCpsDialect.h"
 #include "lgc/LgcIlCpsDialect.h"
+#include "lgc/LgcRtDialect.h"
 #include "llvm-dialects/Dialect/Visitor.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
@@ -89,11 +91,13 @@ class CleanupContinuationsPassImpl {
     /// All functions belonging to this continuation, the entry function is the
     /// first one
     SmallVector<Function *> Functions;
+    SmallVector<Function *> NewFunctions;
+    SmallVector<CallInst *> CpsIntrinsicCalls;
     /// Size of the continuation state in byte
+    bool IsStart = true;
     uint32_t ContStateBytes = 0;
     CallInst *MallocCall = nullptr;
     MDNode *MD = nullptr;
-    SmallVector<Function *> NewFunctions;
   };
 
   void removeContFreeCall(Function *F, Function *ContFree);
@@ -105,7 +109,7 @@ class CleanupContinuationsPassImpl {
   void processContinuations();
   void handleContinue(ContinuationData &Data, Instruction *Ret);
   void handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun);
-  void lowerIntrinsicCall(Module &Mod);
+  void lowerIntrinsicCall(Function *F, ContinuationData &Data);
   void lowerGetResumePoint(Module &Mod);
   bool lowerCompleteOp(Module &Mod);
 
@@ -173,11 +177,21 @@ findTokenOrigin(BasicBlock *BB, Value *V, SmallVectorImpl<Instruction *> &ToRemo
       assert(ResumeFunEntry && "Need a resume fun for each call");
       assert(isa<Constant>(ResumeFunEntry) && "Resume function should be a constant function");
 
-      assert(isa<CallInst>(std::get<1>(CallEntry)) && "Phi should come from a call");
-      Result.insert(std::make_pair(PhiBB, std::make_pair(cast<CallInst>(std::get<1>(CallEntry)), ResumeFunEntry)));
+      Value *CInst = std::get<1>(CallEntry);
+
+      // Strip away bitcasts -- this can happen with multiple token types
+      if (auto *TokenBitcast = dyn_cast<BitCastOperator>(CInst))
+        CInst = TokenBitcast->getOperand(0);
+
+      assert(isa<CallInst>(CInst) && "Phi should come from a call");
+
+      Result.insert(std::make_pair(PhiBB, std::make_pair(cast<CallInst>(CInst), ResumeFunEntry)));
     }
   } else {
     assert(isa<Constant>(ResumeFun) && "Resume function should be a constant function");
+    // Strip away bitcasts -- this can happen with multiple token types
+    if (auto *TokenBitcast = dyn_cast<BitCastOperator>(Call))
+      Call = TokenBitcast->getOperand(0);
     assert(isa<CallInst>(Call) && "Call should be a CallInst");
     auto *CallI = cast<CallInst>(Call);
     Result.insert(std::make_pair(BB, std::make_pair(CallI, ResumeFun)));
@@ -227,35 +241,40 @@ void CleanupContinuationsPassImpl::updateCpsStack(Function *F, Function *NewFunc
   if (IsStart) {
     CpsStack = Builder->create<cps::AllocOp>(Builder->getInt32(CpsInfo.ContStateBytes));
     CpsStack->setName("cont.state.stack.segment");
+    ContHelper::StackSize::setValue(NewFunc, CpsInfo.ContStateBytes);
   } else {
-    // We don't expect stack size metadata on resume functions.
-    ContHelper::StackSize::reset(NewFunc);
     CpsStack = Builder->create<cps::PeekOp>(Builder->getInt32(CpsInfo.ContStateBytes));
   }
 
   SmallVector<Instruction *> ToBeRemoved;
-  Value *OldBase = getContinuationFramePtr(F, IsStart, CpsInfo, &ToBeRemoved);
-  CompilerUtils::replaceAllPointerUses(Builder, OldBase, CpsStack, ToBeRemoved);
+  Value *ContFrame = getContinuationFramePtr(F, IsStart, CpsInfo, &ToBeRemoved);
+
+  if (CpsInfo.ContStateBytes != 0) {
+    CompilerUtils::replaceAllPointerUses(Builder, ContFrame, CpsStack, ToBeRemoved);
+  } else {
+    // If there is no continuation state, replace it with a poison
+    // value instead of a zero-sized stack allocation.
+    // This leads to nicer tests.
+    ContFrame->replaceAllUsesWith(PoisonValue::get(ContFrame->getType()));
+  }
 
   for (auto *I : reverse(ToBeRemoved))
     I->eraseFromParent();
 }
 
-static void updateCpsFunctionArgs(Function *OldFunc, Function *NewFunc, const SmallVector<Value *> &AllArgValues) {
+static void updateFunctionArgs(Function *OldFunc, Function *NewFunc, const SmallVector<Value *> &AllArgValues) {
   // Set arg names for new function
-  for (unsigned Idx = 0; Idx != NewFunc->getFunctionType()->params().size(); ++Idx) {
-    Argument *Arg = NewFunc->getArg(Idx);
-    Value *OldVal = AllArgValues[Idx];
+  for (auto [OldVal, NewArg] : llvm::zip_equal(AllArgValues, NewFunc->args())) {
     if (OldVal) {
-      Arg->setName(OldVal->getName());
-      OldVal->replaceAllUsesWith(Arg);
+      NewArg.setName(OldVal->getName());
+      OldVal->replaceAllUsesWith(&NewArg);
     }
   }
 }
 
-static void buildCpsArgInfos(Function *F, bool IsStart, SmallVector<Type *> &AllArgTypes,
-                             SmallVector<Value *> &AllArgValues, SmallVector<AttributeSet> &ParamAttrs,
-                             SmallVector<Instruction *> &InstsToRemove) {
+static void buildArgInfos(Function *F, bool IsStart, SmallVector<Type *> &AllArgTypes,
+                          SmallVector<Value *> &AllArgValues, SmallVector<AttributeSet> &ParamAttrs,
+                          SmallVector<Instruction *> &InstsToRemove) {
 
   auto &Context = F->getContext();
   AttributeList FAttrs = F->getAttributes();
@@ -271,15 +290,21 @@ static void buildCpsArgInfos(Function *F, bool IsStart, SmallVector<Type *> &All
       ArgNo++;
     }
   } else {
-    //  Add extra arguments ({} %state, i32 %rcr, i32 %shader-index) for resume
-    //  part. But for now, we always use continuation stack to pass continuation
-    //  state.
-    AllArgTypes.push_back(StructType::get(Context, {}));
-    AllArgValues.push_back(nullptr);
-    AllArgTypes.push_back(IntegerType::get(Context, 32));
-    AllArgValues.push_back(nullptr);
-    AllArgTypes.push_back(IntegerType::get(Context, 32));
-    AllArgValues.push_back(nullptr);
+    if (lgc::cps::isCpsFunction(*F)) {
+      //  Add extra arguments ({} %state, i32 %rcr, i32 %shader-index) for resume
+      //  part. But for now, we always use continuation stack to pass continuation
+      //  state.
+      Type *I32 = Type::getInt32Ty(Context);
+      AllArgTypes.push_back(StructType::get(Context, {}));
+      AllArgValues.push_back(nullptr);
+      AllArgTypes.push_back(I32);
+      AllArgValues.push_back(nullptr);
+      AllArgTypes.push_back(I32);
+      AllArgValues.push_back(nullptr);
+    } else {
+      AllArgTypes.push_back(Type::getInt64Ty(Context)); // Dummy return address for resume functions
+      AllArgValues.push_back(nullptr);
+    }
 
     // Find arguments from lgc.ilcps.getreturnvalue calls
     for (auto &I : F->getEntryBlock()) {
@@ -349,24 +374,58 @@ void CleanupContinuationsPassImpl::freeCpsStack(Function *F, ContinuationData &C
 bool CleanupContinuationsPassImpl::lowerCompleteOp(Module &Mod) {
   struct VisitState {
     llvm_dialects::Builder *Builder;
-    bool completeLowered;
+    bool CompleteLowered;
   };
 
-  bool completeLowered = false;
-  VisitState State = {Builder, completeLowered};
+  VisitState State = {Builder, false};
   static auto Visitor = llvm_dialects::VisitorBuilder<VisitState>()
-                            .add<cps::CompleteOp>([](VisitState &State, auto &complete) {
-                              State.Builder->SetInsertPoint(&complete);
+                            .add<cps::CompleteOp>([](VisitState &State, auto &Complete) {
+                              State.Builder->SetInsertPoint(&Complete);
                               State.Builder->CreateRetVoid();
-                              BasicBlock *block = complete.getParent();
-                              block->getTerminator()->eraseFromParent();
-                              complete.eraseFromParent();
-                              State.completeLowered = true;
+                              BasicBlock *BB = Complete.getParent();
+                              BB->getTerminator()->eraseFromParent();
+                              Complete.eraseFromParent();
+                              State.CompleteLowered = true;
                             })
                             .build();
 
   Visitor.visit(State, Mod);
-  return State.completeLowered;
+  return State.CompleteLowered;
+}
+
+// For a resume function, find the continue call to it (by looking at its uses)
+// and obtain the incoming payload register count into the resume function
+// as the outgoing register count of the continue call, indicated by metadata.
+static uint32_t getIncomingRegisterCount(Function *ResumeFunc) {
+  // Walk the users, looking through constants and continuation references, to
+  // the continue calls; their returned-registercount metadata is our result.
+  SmallVector<User *> Worklist(ResumeFunc->users());
+  std::optional<uint32_t> RegCount;
+  while (!Worklist.empty()) {
+    auto *U = Worklist.pop_back_val();
+    if (isa<Constant>(U) || isa<lgc::cps::AsContinuationReferenceOp>(U)) {
+      Worklist.append(U->user_begin(), U->user_end());
+      continue;
+    }
+    assert(isa<CallInst>(U) && "User of a resume function should be a call to continue");
+    auto *Inst = cast<CallInst>(U);
+    if (auto Count = ContHelper::ReturnedRegisterCount::tryGetValue(Inst)) {
+      assert((!RegCount || *RegCount == *Count) && "Got different returned registercounts in continues to "
+                                                   "the same resume function");
+      RegCount = *Count;
+#ifdef NDEBUG
+      break;
+#endif
+    } else {
+      LLVM_DEBUG(Inst->dump());
+      report_fatal_error("Found a jump call without "
+                         "continuation returned registercount metadata");
+    }
+  }
+  // Fail with a diagnosable message instead of accessing an empty optional.
+  if (!RegCount)
+    report_fatal_error("Resume function is not referenced by any continue call");
+  return *RegCount;
 }
 
 void CleanupContinuationsPassImpl::processContinuations() {
@@ -376,13 +435,19 @@ void CleanupContinuationsPassImpl::processContinuations() {
   //    b.) change the address space for cps stack to 32.
   // 2. prepare arguments passed to cps.jump and insert the call at the exit of
   //    start part.
-  // 3. Edit resume signature to add the state/rcr/shader-indxe/returnvalues.
+  // 3. Edit resume signature to add the state/rcr/shader-index/returnvalues.
+  SmallVector<Function *> ToErase;
   for (auto &FuncData : ToProcess) {
     LLVM_DEBUG(dbgs() << "Processing function: " << FuncData.first->getName() << "\n");
     for (auto *F : FuncData.second.Functions) {
-      // Set same linkage as for start function
-      if (F != FuncData.first)
+      if (F != FuncData.first) {
+        // Set same linkage as for start function
         F->setLinkage(FuncData.first->getLinkage());
+        // Entry marker should only be on the start and not on resume functions
+        F->eraseMetadata(F->getContext().getMDKindID(ContHelper::MDEntryName));
+        // Same for stacksize
+        ContHelper::StackSize::reset(F);
+      }
 
       // Ignore the stub created for the coroutine passes
       if (F->empty())
@@ -392,18 +457,13 @@ void CleanupContinuationsPassImpl::processContinuations() {
 
       // If this is the continuation start
       bool IsStart = F == FuncData.first;
-      // We don't need to touch resume part of non-cps function, this usually
-      // should be entry-point compute kernel. The resume part will be erased
-      // at the end.
-      if (!IsStart && !cps::isCpsFunction(*F))
-        continue;
 
       SmallVector<Type *> AllArgTypes;
       SmallVector<Value *> AllArgValues;
       SmallVector<AttributeSet> ParamAttrs;
       SmallVector<Instruction *> InstsToRemove;
 
-      buildCpsArgInfos(F, IsStart, AllArgTypes, AllArgValues, ParamAttrs, InstsToRemove);
+      buildArgInfos(F, IsStart, AllArgTypes, AllArgValues, ParamAttrs, InstsToRemove);
 
       if (ContFree)
         removeContFreeCall(F, ContFree);
@@ -415,16 +475,31 @@ void CleanupContinuationsPassImpl::processContinuations() {
       auto *NewFuncTy = FunctionType::get(Type::getVoidTy(Context), AllArgTypes, false);
       Function *NewFunc = CompilerUtils::cloneFunctionHeader(*F, NewFuncTy, ParamAttrs);
       NewFunc->takeName(F);
+
+      ToErase.push_back(F);
       FuncData.second.NewFunctions.push_back(NewFunc);
 
       // Transfer code from old function to new function
       llvm::moveFunctionBody(*F, *NewFunc);
 
       auto &CpsInfo = FuncData.second;
+
+      // Add function metadata that stores how big the continuation state is in bytes.
+      // Technically, continuation state includes the spilled payload here.
+      // However, we want to exclude it here for statistics.
+      // TODO: Remove this once we can properly report payload size statistics in LowerRaytracingPipeline.
+      if (IsStart) {
+        const uint32_t PayloadSpillSize = ContHelper::StackSize::tryGetValue(NewFunc).value_or(0);
+        assert(CpsInfo.ContStateBytes >= PayloadSpillSize);
+        ContHelper::ContinuationStateByteCount::setValue(NewFunc, CpsInfo.ContStateBytes - PayloadSpillSize);
+      }
+
+      CpsInfo.IsStart = IsStart;
+
       if (CpsInfo.ContStateBytes)
         updateCpsStack(F, NewFunc, IsStart, CpsInfo);
 
-      updateCpsFunctionArgs(F, NewFunc, AllArgValues);
+      updateFunctionArgs(F, NewFunc, AllArgValues);
 
       freeCpsStack(NewFunc, CpsInfo);
       // Handle the function returns
@@ -443,17 +518,40 @@ void CleanupContinuationsPassImpl::processContinuations() {
       // Update the `ToProcess` for later processing.
       if (IsStart)
         FuncData.first = NewFunc;
+
+      // Record lgc.rt intrinsic function calls.
+      for (auto &IntrinsicFunc : Mod.functions()) {
+        if (!lgc::rt::LgcRtDialect::isDialectOp(IntrinsicFunc))
+          continue;
+
+        llvm::forEachCall(IntrinsicFunc, [&](CallInst &CInst) {
+          auto *Caller = CInst.getFunction();
+          if (Caller != NewFunc)
+            return;
+
+          auto IntrImplEntry = llvm::findIntrImplEntryByIntrinsicCall(&CInst);
+          if (IntrImplEntry == std::nullopt)
+            return;
+
+          CpsInfo.CpsIntrinsicCalls.push_back(&CInst);
+        });
+      }
+
+      // Lower lgc.rt intrinsics
+      lowerIntrinsicCall(NewFunc, CpsInfo);
     }
-  }
 
-  // Remove the old functions
-  for (auto &FuncData : ToProcess) {
-    if (FuncData.second.Functions.size() > 1) {
-      // Only for functions that were split
-      for (auto *F : FuncData.second.Functions)
-        F->eraseFromParent();
+    for (Function *F : FuncData.second.NewFunctions) {
+      if (FuncData.first != F) {
+        uint32_t IncomingRegisterCount = getIncomingRegisterCount(F);
+        ContHelper::IncomingRegisterCount::setValue(F, IncomingRegisterCount);
+      }
     }
   }
+
+  // Remove the old functions
+  for (Function *F : ToErase)
+    F->eraseFromParent();
 }
 
 /// Transform
@@ -501,28 +599,33 @@ void CleanupContinuationsPassImpl::handleSingleContinue(ContinuationData &Data,
   Builder->SetInsertPoint(Call);
 
   SmallVector<Value *> TailArgs;
-  uint32_t SkipCount = 2;
   Value *ResumeAddr = nullptr;
-  const bool IsWait = ContHelper::isWaitAwaitCall(*Call);
-  // WaitMask and %rcr (aka. return continuation reference) for the callee.
-  if (cps::isCpsFunction(*cast<Function>(ResumeFun))) {
+  Value *CR = nullptr;
+  unsigned LevelImm = -1;
+
+  uint32_t SkipCount = 2;
+  if (ContHelper::isLgcCpsModule(*Call->getModule()))
+    SkipCount = ContHelper::isWaitAwaitCall(*Call) ? 3 : 2;
+
+  if (lgc::rt::getLgcRtShaderStage(Call->getFunction()) != lgc::rt::RayTracingShaderStage::KernelEntry) {
     ResumeAddr = Builder->create<cps::AsContinuationReferenceOp>(ContinuationReferenceType, ResumeFun);
-    if (IsWait)
-      SkipCount = 3;
   } else {
     // For entry-point compute kernel, pass a poison %rcr.
-    ResumeAddr = PoisonValue::get(Builder->getInt32Ty());
+    ResumeAddr = PoisonValue::get(ContinuationReferenceType);
   }
-  // Skip continuation.reference, levels and potentially the wait mask.
+
+  CR = Call->getArgOperand(0);
   TailArgs.append(SmallVector<Value *>(drop_begin(Call->args(), SkipCount)));
-  auto *CR = Call->getArgOperand(0);
 
-  Value *Level = Call->getArgOperand(IsWait ? 2 : 1);
-  unsigned LevelImm = cast<ConstantInt>(Level)->getZExtValue();
+  if (lgc::cps::isCpsFunction(*Call->getFunction())) {
+    Value *Level = Call->getArgOperand(SkipCount - 1);
+    LevelImm = cast<ConstantInt>(Level)->getZExtValue();
+  }
 
   // TODO: Continuation state is passed through stack for now.
   auto *State = PoisonValue::get(StructType::get(Builder->getContext(), {}));
-  auto *JumpCall = Builder->create<cps::JumpOp>(CR, LevelImm, State, ResumeAddr, TailArgs);
+  auto *Csp = PoisonValue::get(Builder->getInt32Ty());
+  auto *JumpCall = Builder->create<cps::JumpOp>(CR, LevelImm, State, Csp, ResumeAddr, TailArgs);
   // Replace this instruction with a call to cps.jump.
   JumpCall->copyMetadata(*Call);
 
@@ -537,45 +640,41 @@ void CleanupContinuationsPassImpl::handleSingleContinue(ContinuationData &Data,
 }
 
 /// Lower lgc.rt calls inside cps functions.
-void CleanupContinuationsPassImpl::lowerIntrinsicCall(Module &Mod) {
-  DenseMap<Function *, SmallVector<CallInst *>> CpsIntrinsicCalls;
+void CleanupContinuationsPassImpl::lowerIntrinsicCall(Function *F, ContinuationData &Data) {
+  if (Data.CpsIntrinsicCalls.empty())
+    return;
 
-  // We only care about lgc.rt here.
-  for (auto &F : Mod.functions()) {
-    auto Name = F.getName();
-    if (!Name.starts_with("lgc.rt"))
-      continue;
+  auto Stage = lgc::rt::getLgcRtShaderStage(F);
+  if (!Stage)
+    return;
 
-    llvm::forEachCall(F, [&](CallInst &CInst) {
-      auto IntrImplEntry = llvm::findIntrImplEntryByIntrinsicCall(&CInst);
-      if (IntrImplEntry == std::nullopt)
-        return;
+  CompilerUtils::CrossModuleInliner CrossInliner;
+  // Signature of cps function: { state, rcr, shader-index, system-data}
+  const uint32_t SystemDataArgIdx = lgc::cps::isCpsFunction(*F) ? CpsArgIdxSystemData : 1;
 
-      auto *Caller = CInst.getFunction();
-      CpsIntrinsicCalls[Caller].push_back(&CInst);
-    });
-  }
+  Value *SystemDataArg = F->getArg(SystemDataArgIdx);
+  Type *SystemDataTy = SystemDataArg->getType();
+  // Extract the original system data from the { systemData, padding, payload }
+  // struct returned by await.
+  if (!Data.IsStart)
+    SystemDataTy = SystemDataTy->getStructElementType(0);
 
-  CompilerUtils::CrossModuleInliner CrossInliner;
-  for (const auto &[Caller, IntrinsicCalls] : CpsIntrinsicCalls) {
-    // No need to insert system data alloca if no intrinsic call.
-    if (IntrinsicCalls.empty())
-      continue;
+  Builder->SetInsertPointPastAllocas(F);
+  auto *SystemData = Builder->CreateAlloca(SystemDataTy);
 
-    auto Stage = lgc::rt::getLgcRtShaderStage(Caller);
-    if (!Stage)
-      continue;
+  SystemData->setName("system.data.alloca");
 
-    // Signature of cps function: { state, rcr, shader-index, system-data}
-    auto *SystemDataArg = Caller->getArg(CpsArgIdxSystemData);
-    assert(SystemDataArg->getType()->isStructTy() && "SystemData should be struct type");
-    auto *AllocaInsertPt = &*Caller->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
-    Builder->SetInsertPoint(AllocaInsertPt);
-    auto *SystemData = Builder->CreateAlloca(SystemDataArg->getType());
-    Builder->CreateStore(SystemDataArg, SystemData);
-    for (auto *Call : IntrinsicCalls)
-      replaceIntrinsicCall(*Builder, SystemDataArg->getType(), SystemData, *Stage, Call,
-                           GpurtLibrary ? GpurtLibrary : &Mod, CrossInliner);
+  if (!Data.IsStart)
+    SystemDataArg = Builder->CreateExtractValue(SystemDataArg, 0);
+
+  assert(SystemDataArg->getType()->isStructTy() && "SystemData should be struct type");
+
+  Builder->CreateStore(SystemDataArg, SystemData);
+  while (!Data.CpsIntrinsicCalls.empty()) {
+    // Drain the list while lowering; leaving entries behind would process the same calls twice by accident.
+    auto *Call = Data.CpsIntrinsicCalls.pop_back_val();
+    replaceIntrinsicCall(*Builder, SystemDataArg->getType(), SystemData, *Stage, Call,
+                         GpurtLibrary ? GpurtLibrary : &Mod, CrossInliner);
   }
 }
 
@@ -599,6 +698,22 @@ void CleanupContinuationsPassImpl::lowerGetResumePoint(Module &Mod) {
       auto *ResumePtr = Builder->CreateZExt(ResumeFn, Builder->getInt64Ty());
       GetResumeCall->replaceAllUsesWith(ResumePtr);
       GetResumeCall->eraseFromParent();
+
+      // Re-create the lgc.cps.jump call without the return address
+      // argument, since the calling code handles it manually.
+      if (!lgc::cps::isCpsFunction(*Jump->getFunction())) {
+        SmallVector<Value *> Args;
+        for (unsigned I = 0; I < Jump->arg_size(); I++) {
+          if (I != 4) // Return address argument
+            Args.push_back(Jump->getArgOperand(I));
+        }
+
+        Builder->SetInsertPoint(Jump);
+        auto *NewCall = Builder->CreateCall(Jump->getCalledFunction(), Args);
+        NewCall->copyMetadata(*Jump);
+
+        Jump->eraseFromParent();
+      }
     }
   }
 }
@@ -681,8 +796,6 @@ llvm::PreservedAnalyses CleanupContinuationsPassImpl::run() {
   bool Changed = false;
   if (!ToProcess.empty()) {
     processContinuations();
-    // Lower lgc.rt intrinsics
-    lowerIntrinsicCall(Mod);
 
     lowerGetResumePoint(Mod);
     Changed = true;
diff --git a/llvmraytracing/lib/Continuations.cpp b/llvmraytracing/lib/Continuations.cpp
index a9bb95cc3a..5e064228a7 100644
--- a/llvmraytracing/lib/Continuations.cpp
+++ b/llvmraytracing/lib/Continuations.cpp
@@ -34,6 +34,7 @@
 #include "compilerutils/DxilToLlvm.h"
 #include "llvmraytracing/ContinuationsUtil.h"
 #include "llvmraytracing/GpurtContext.h"
+#include "llvmraytracing/SpecializeDriverShaders.h"
 #include "lgc/LgcCpsDialect.h"
 #include "lgc/LgcIlCpsDialect.h"
 #include "lgc/LgcRtDialect.h"
@@ -112,10 +113,6 @@ void llvm::replaceCallsToFunction(Function &F, Value &Replacement) {
   });
 }
 
-bool llvm::isLgcRtOp(const llvm::Function *F) {
-  return F && F->getName().starts_with("lgc.rt.");
-}
-
 void llvm::moveFunctionBody(Function &OldFunc, Function &NewFunc) {
   while (!OldFunc.empty()) {
     BasicBlock *BB = &OldFunc.front();
@@ -125,7 +122,7 @@ void llvm::moveFunctionBody(Function &OldFunc, Function &NewFunc) {
 }
 
 std::optional<llvm::GpuRtIntrinsicEntry> llvm::findIntrImplEntryByIntrinsicCall(CallInst *Call) {
-  if (!isLgcRtOp(Call->getCalledFunction()))
+  if (!lgc::rt::LgcRtDialect::isDialectOp(*Call->getCalledFunction()))
     return std::nullopt;
 
   auto ImplEntry = LgcRtGpuRtMap.find(*Call);
@@ -140,7 +137,7 @@ bool llvm::removeUnusedFunctionDecls(Module *Mod, bool OnlyIntrinsics) {
 
   for (Function &F : make_early_inc_range(*Mod)) {
     if (F.isDeclaration() && F.user_empty()) {
-      if (!OnlyIntrinsics || (isLgcRtOp(&F) || F.getName().starts_with("dx.op."))) {
+      if (!OnlyIntrinsics || (lgc::rt::LgcRtDialect::isDialectOp(F) || F.getName().starts_with("dx.op."))) {
         F.eraseFromParent();
         DidChange = true;
       }
@@ -153,7 +150,7 @@ bool llvm::removeUnusedFunctionDecls(Module *Mod, bool OnlyIntrinsics) {
 bool ContHelper::isRematerializableLgcRtOp(CallInst &CInst, std::optional<lgc::rt::RayTracingShaderStage> Kind) {
   using namespace lgc::rt;
   Function *Callee = CInst.getCalledFunction();
-  if (!llvm::isLgcRtOp(Callee))
+  if (!LgcRtDialect::isDialectOp(*Callee))
     return false;
 
   // Always rematerialize
@@ -510,6 +507,9 @@ void ContHelper::addContinuationPasses(ModulePassManager &MPM) {
   // Convert the system data struct to a value, so it isn't stored in the
   // continuation state
   MPM.addPass(createModuleToFunctionPassAdaptor(SROAPass(llvm::SROAOptions::ModifyCFG)));
+
+  MPM.addPass(SpecializeDriverShadersPass());
+
   MPM.addPass(LowerAwaitPass());
 
   MPM.addPass(CoroEarlyPass());
@@ -517,7 +517,7 @@ void ContHelper::addContinuationPasses(ModulePassManager &MPM) {
   MPM.addPass(createModuleToFunctionPassAdaptor(CoroElidePass()));
   MPM.addPass(CoroCleanupPass());
 
-  MPM.addPass(LegacyCleanupContinuationsPass());
+  MPM.addPass(DXILCleanupContinuationsPass());
   MPM.addPass(ContinuationsStatsReportPass());
   MPM.addPass(DXILContPostProcessPass());
 
@@ -833,6 +833,7 @@ CallInst *llvm::replaceIntrinsicCall(IRBuilder<> &B, Type *SystemDataTy, Value *
   }
 
   // Tolerate Replacement returning a single-element struct containing a value of the right type.
+  // That happens when the called function is _cont_ObjectToWorld4x3 (and possibly others) from LLPCFE.
   if (!Call->getType()->isVoidTy() && Call->getType() != Replacement->getType()) {
     assert(cast<StructType>(Replacement->getType())->getNumElements() == 1);
     Replacement = B.CreateExtractValue(Replacement, 0);
@@ -851,7 +852,6 @@ CallInst *llvm::replaceIntrinsicCall(IRBuilder<> &B, Type *SystemDataTy, Value *
 static bool replaceEnqueueIntrinsic(Function &F) {
   bool Changed = false;
   StringRef FuncName = F.getName();
-  bool IsEnqueueCall = FuncName.contains("EnqueueCall");
   bool IsWaitEnqueue = FuncName.contains("WaitEnqueue");
   llvm_dialects::Builder B{F.getContext()};
 
@@ -860,15 +860,7 @@ static bool replaceEnqueueIntrinsic(Function &F) {
     CallInst *NewCall = nullptr;
     Value *WaitMask = nullptr;
     Value *RetAddr = nullptr;
-    if (IsEnqueueCall) {
-      // Add the current function as return address to the call.
-      // Used when Traversal calls AnyHit or Intersection.
-      RetAddr = B.create<lgc::cps::AsContinuationReferenceOp>(B.getInt64Ty(), CInst.getFunction());
-      // Handle WaitEnqueueCall.
-      if (IsWaitEnqueue)
-        WaitMask = CInst.getArgOperand(1);
-
-    } else if (IsWaitEnqueue) {
+    if (IsWaitEnqueue) {
       // Handle WaitEnqueue.
       WaitMask = CInst.getArgOperand(1);
       RetAddr = CInst.getArgOperand(2);
@@ -883,10 +875,16 @@ static bool replaceEnqueueIntrinsic(Function &F) {
     // defined in the LgcCpsDialect.
     const uint32_t DummyLevelsArg = -1;
     Value *DummyContState = PoisonValue::get(StructType::get(B.getContext()));
-    NewCall = B.create<lgc::cps::JumpOp>(CInst.getArgOperand(0), DummyLevelsArg, DummyContState, RetAddr, TailArgs);
-
-    if (WaitMask)
-      ContHelper::setWaitMask(*NewCall, cast<ConstantInt>(WaitMask)->getSExtValue());
+    Value *DummyCsp = PoisonValue::get(B.getInt32Ty());
+    NewCall =
+        B.create<lgc::cps::JumpOp>(CInst.getArgOperand(0), DummyLevelsArg, DummyContState, DummyCsp, RetAddr, TailArgs);
+
+    if (WaitMask) {
+      // The only supported wait mask is a constant -1. We don't enforce having a constant here because the SPIR-V
+      // build of GPURT isn't optimized.
+      assert(!isa<ConstantInt>(WaitMask) || cast<ConstantInt>(WaitMask)->getSExtValue() == -1);
+      ContHelper::setWaitMask(*NewCall);
+    }
 
     // NOTE: Inlining ExitRayGen in LowerRaytracingPipeline can cause continue
     // ops whose name is suffixed .cloned.*, which don't get picked up by the
@@ -908,21 +906,12 @@ static void handleContinuationStackIsGlobal(Function &Func, ContStackAddrspace S
          // bool
          && Func.getFunctionType()->getReturnType()->isIntegerTy(1));
 
-  auto *IsGlobal = ConstantInt::getBool(Func.getContext(), StackAddrspace == ContStackAddrspace::Global);
+  auto *IsGlobal = ConstantInt::getBool(Func.getContext(), StackAddrspace == ContStackAddrspace::Global ||
+                                                               StackAddrspace == ContStackAddrspace::GlobalLLPC);
 
   llvm::replaceCallsToFunction(Func, *IsGlobal);
 }
 
-static void handleContinuationsGetFlags(Function &Func, uint32_t Flags) {
-  assert(Func.arg_empty()
-         // i32
-         && Func.getFunctionType()->getReturnType()->isIntegerTy(32));
-
-  auto *FlagsConst = ConstantInt::get(IntegerType::get(Func.getContext(), 32), Flags);
-
-  llvm::replaceCallsToFunction(Func, *FlagsConst);
-}
-
 static void handleGetRtip(Function &Func, uint32_t RtipLevel) {
   assert(Func.arg_empty()
          // i32
@@ -1082,7 +1071,7 @@ void llvm::terminateShader(IRBuilder<> &Builder, CallInst *CompleteCall) {
   Type *FuncRetTy = CompleteCall->getFunction()->getReturnType();
   // For functions returning a value, return a poison. Resume functions
   // and other shaders will simply return a void value when this helper is being
-  // called from LegacyCleanupContinuations. These will be treated as
+  // called from CleanupContinuations. These will be treated as
   // continuation.complete by the translator.
   ReturnInst *Ret = nullptr;
   if (FuncRetTy->isVoidTy())
@@ -1099,21 +1088,20 @@ void llvm::terminateShader(IRBuilder<> &Builder, CallInst *CompleteCall) {
   // - Remove the complete call.
   // This is intended to work for _AmdComplete appearing in conditional code
   // or the unreachable inserted by various passes before
-  // LegacyCleanupContinuations.
+  // CleanupContinuations.
   SplitBlock(CompleteCall->getParent(), CompleteCall);
   // Remove the branch to the split block.
   Ret->getParent()->getTerminator()->eraseFromParent();
   CompleteCall->eraseFromParent();
 }
 
-bool llvm::earlyDriverTransform(Module &M) {
+bool llvm::earlyGpurtTransform(Module &M) {
   // Import StackAddrspace from metadata if set, otherwise from default
   auto StackAddrspaceMD = ContHelper::tryGetStackAddrspace(M);
   auto StackAddrspace = StackAddrspaceMD.value_or(ContHelper::DefaultStackAddrspace);
 
   // Import from metadata if set
   auto RtipLevel = ContHelper::Rtip::tryGetValue(&M);
-  auto Flags = ContHelper::Flags::tryGetValue(&M);
   SmallVector<ContSetting> GpurtSettings;
   ContHelper::getGpurtSettings(M, GpurtSettings);
 
@@ -1129,12 +1117,6 @@ bool llvm::earlyDriverTransform(Module &M) {
     if (Name.starts_with("_AmdContinuationStackIsGlobal")) {
       Changed = true;
       handleContinuationStackIsGlobal(F, StackAddrspace);
-    } else if (Name.starts_with("_AmdContinuationsGetFlags")) {
-      Changed = true;
-      if (!Flags)
-        report_fatal_error("Tried to get continuation flags but it is not "
-                           "available on the module");
-      handleContinuationsGetFlags(F, *Flags);
     } else if (Name.starts_with("_AmdGetRtip")) {
       Changed = true;
       if (!RtipLevel)
diff --git a/llvmraytracing/lib/CpsStackLowering.cpp b/llvmraytracing/lib/CpsStackLowering.cpp
index 1ea139b1b3..02ba0569e5 100644
--- a/llvmraytracing/lib/CpsStackLowering.cpp
+++ b/llvmraytracing/lib/CpsStackLowering.cpp
@@ -120,7 +120,11 @@ void CpsStackLowering::visitGetElementPtr(GetElementPtrInst &GEP) {
   unsigned BitWidth = DL.getIndexSizeInBits(GEP.getPointerAddressSpace());
 
   APInt ConstantOffset{BitWidth, 0};
+#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 513542
   MapVector<Value *, APInt> VariableOffsets;
+#else
+  SmallMapVector<Value *, APInt, 4> VariableOffsets;
+#endif
 
   [[maybe_unused]] bool Success = GEP.collectOffset(DL, BitWidth, VariableOffsets, ConstantOffset);
   assert(Success && "CpsStackLowering::visitGetElementPtr: GEP.collectOffset "
diff --git a/llvmraytracing/lib/DXILContIntrinsicPrepare.cpp b/llvmraytracing/lib/DXILContIntrinsicPrepare.cpp
index 85031c1e6c..6c3868e264 100644
--- a/llvmraytracing/lib/DXILContIntrinsicPrepare.cpp
+++ b/llvmraytracing/lib/DXILContIntrinsicPrepare.cpp
@@ -33,6 +33,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "compilerutils/ArgPromotion.h"
+#include "compilerutils/DxilUtils.h"
 #include "llvmraytracing/Continuations.h"
 #include "llvmraytracing/ContinuationsUtil.h"
 #include "lgc/LgcRtDialect.h"
@@ -60,29 +61,16 @@ static Function *transformFunction(Function &F) {
     // Local scope for Name which is invalidated at the end.
     auto Name = F.getName();
     LLVM_DEBUG(dbgs() << "Transforming function " << Name << "\n");
-    std::string NewName = Name.str();
-
-    // Unmangle declarations because they cannot be renamed in the dx api
-    if (Name.contains('@')) {
-      // Extract unmangled name
-      auto Start = Name.find('?') + 1;
-      auto End = Name.find('@', Start);
-      if (Start == 0 || End == StringRef::npos || Start > Name.size() || End > Name.size()) {
-        report_fatal_error(Twine("Failed to unmangle function name: Failed to extract from '") + Name +
-                           "' (start: " + Twine(Start) + ", end: " + Twine(End) + ")");
-      }
-
-      // Copy name, otherwise it will be deleted before it's set
-      NewName = Name.substr(Start, End - Start).str();
-    }
+    // Copy name, otherwise it will be deleted before it is set
+    std::string NewName = CompilerUtils::dxil::tryDemangleFunctionName(Name.str()).str();
 
     LLVM_DEBUG(dbgs() << "  Set new name " << NewName << "\n");
+    F.setName(NewName);
 
     if (NewName == ContDriverFunc::TraversalName)
       lgc::rt::setLgcRtShaderStage(&F, lgc::rt::RayTracingShaderStage::Traversal);
     else if (NewName == ContDriverFunc::KernelEntryName)
       lgc::rt::setLgcRtShaderStage(&F, lgc::rt::RayTracingShaderStage::KernelEntry);
-    F.setName(NewName);
   }
 
   // Unpack the inner type of @class.matrix types
@@ -183,6 +171,19 @@ static void handleIsLlpc(Function &Func) {
   llvm::replaceCallsToFunction(Func, *FalseConst);
 }
 
+static void handleGetShaderRecordIndex(llvm_dialects::Builder &B, Function &Func) {
+  assert(Func.arg_empty()
+         // i32
+         && Func.getFunctionType()->getReturnType()->isIntegerTy(32));
+
+  llvm::forEachCall(Func, [&](CallInst &CInst) {
+    B.SetInsertPoint(&CInst);
+    auto *ShaderIndexCall = B.create<lgc::rt::ShaderIndexOp>();
+    CInst.replaceAllUsesWith(ShaderIndexCall);
+    CInst.eraseFromParent();
+  });
+}
+
 llvm::PreservedAnalyses DXILContIntrinsicPreparePass::run(llvm::Module &M,
                                                           llvm::ModuleAnalysisManager &AnalysisManager) {
   LLVM_DEBUG(dbgs() << "Run the dxil-cont-intrinsic-prepare pass\n");
@@ -191,6 +192,8 @@ llvm::PreservedAnalyses DXILContIntrinsicPreparePass::run(llvm::Module &M,
 
   SmallVector<Function *> Funcs(make_pointer_range(M.functions()));
 
+  llvm_dialects::Builder B{M.getContext()};
+
   for (auto *F : Funcs) {
     auto Name = F->getName();
     bool ShouldTransform = false;
@@ -206,6 +209,9 @@ llvm::PreservedAnalyses DXILContIntrinsicPreparePass::run(llvm::Module &M,
       } else if (Name.contains("IsLlpc")) {
         ShouldTransform = false;
         handleIsLlpc(*F);
+      } else if (Name.contains("GetShaderRecordIndex")) {
+        ShouldTransform = false;
+        handleGetShaderRecordIndex(B, *F);
       }
     }
 
@@ -215,7 +221,7 @@ llvm::PreservedAnalyses DXILContIntrinsicPreparePass::run(llvm::Module &M,
 
   fixupDxilMetadata(M);
 
-  earlyDriverTransform(M);
+  earlyGpurtTransform(M);
 
   return PreservedAnalyses::none();
 }
diff --git a/llvmraytracing/lib/DXILContPostProcess.cpp b/llvmraytracing/lib/DXILContPostProcess.cpp
index 84bf959c42..cfd0073f1c 100644
--- a/llvmraytracing/lib/DXILContPostProcess.cpp
+++ b/llvmraytracing/lib/DXILContPostProcess.cpp
@@ -83,13 +83,10 @@ class DXILContPostProcessPassImpl final {
   };
 
 private:
-  void lowerGetResumePointAddr(Function &F);
-
   void handleContStackIntrinsic(FunctionAnalysisManager &FAM, Function &F);
 
   void initializeProcessableFunctionData();
-  bool replaceIntrinsicCalls(Function &F, const FunctionData &Data);
-  bool handleIntrinsicCalls(llvm::ModuleAnalysisManager &AnalysisManager);
+  bool handleContStackIntrinsics(llvm::ModuleAnalysisManager &AnalysisManager);
   bool lowerCpsOps();
   void lowerJumpOp(lgc::cps::JumpOp &JumpOp);
   void lowerAsContinuationReferenceOp(lgc::cps::AsContinuationReferenceOp &AsCrOp);
@@ -155,81 +152,6 @@ static Function *getContinuationGetAddrAndMD(Module &M) {
   }
 }
 
-void DXILContPostProcessPassImpl::lowerGetResumePointAddr(Function &F) {
-  auto *GetResumePointAddr = &F;
-
-  assert(GetResumePointAddr->getReturnType()->isIntegerTy(64) && GetResumePointAddr->arg_size() == 0);
-
-  // Search calls to GetResumePointAddr, and lower it to the argument of the
-  // next continue call. Then remove it from that continue call.
-  for (auto &Use : make_early_inc_range(GetResumePointAddr->uses())) {
-    auto *CInst = dyn_cast<CallInst>(Use.getUser());
-    if (!CInst || !CInst->isCallee(&Use) || ToProcess.count(CInst->getFunction()) == 0) {
-      // Non-call use, or call in unknown function. This will likely result in a
-      // remaining non-lowered call reported as error at the end of this
-      // function.
-      continue;
-    }
-
-    // Instead of passing the resume address to the next continue call,
-    // use it as the return value of GetResumePointAddr and remove it from
-    // the continue arguments.
-    auto FoundContinueCall = findDominatedContinueCall(CInst);
-
-    if (!FoundContinueCall) {
-      report_fatal_error("Did not find a continue call after a "
-                         "GetResumePointAddr");
-    }
-    auto *ContinueCall = *FoundContinueCall;
-
-    unsigned ReturnAddrArgNum = 1;
-    Value *ReturnAddr = nullptr;
-
-    if (auto *Jump = dyn_cast<lgc::cps::JumpOp>(ContinueCall)) {
-      ReturnAddrArgNum = 3;
-      ReturnAddr = Jump->getRcr();
-    }
-
-    assert((ReturnAddr->getType() == Builder.getInt64Ty()) && "Unexpected return addr type!");
-
-    SmallVector<Instruction *> MoveInstrs;
-    if (auto *I = dyn_cast<Instruction>(ReturnAddr)) {
-      if (!I->comesBefore(CInst))
-        MoveInstrs.push_back(I);
-    }
-
-    unsigned Done = 0;
-    while (Done < MoveInstrs.size()) {
-      for (auto &O : MoveInstrs[Done]->operands()) {
-        if (auto *I = dyn_cast<Instruction>(O)) {
-          if (!I->comesBefore(CInst))
-            MoveInstrs.push_back(I);
-        }
-      }
-      ++Done;
-    }
-    for (auto I = MoveInstrs.rbegin(), E = MoveInstrs.rend(); I != E; ++I)
-      (*I)->moveBefore(CInst);
-
-    CInst->replaceAllUsesWith(ReturnAddr);
-
-    // Re-create the lgc.ilcps.continue / lgc.cps.jump call without the return address
-    // argument.
-    SmallVector<Value *> Args;
-    for (unsigned I = 0; I < ContinueCall->arg_size(); I++) {
-      if (I != ReturnAddrArgNum)
-        Args.push_back(ContinueCall->getArgOperand(I));
-    }
-
-    Builder.SetInsertPoint(ContinueCall);
-    auto *NewCall = Builder.CreateCall(ContinueCall->getCalledFunction(), Args);
-    NewCall->copyMetadata(*ContinueCall);
-
-    CInst->eraseFromParent();
-    ContinueCall->eraseFromParent();
-  }
-}
-
 // Replace calls to _AmdContStack* with calls to lgc.cps dialect ops.
 // Do some simple constant propagation on the fly.
 void DXILContPostProcessPassImpl::handleContStackIntrinsic(FunctionAnalysisManager &FAM, Function &F) {
@@ -413,25 +335,11 @@ void DXILContPostProcessPassImpl::initializeProcessableFunctionData() {
   }
 }
 
-bool DXILContPostProcessPassImpl::handleIntrinsicCalls(llvm::ModuleAnalysisManager &AnalysisManager) {
+bool DXILContPostProcessPassImpl::handleContStackIntrinsics(llvm::ModuleAnalysisManager &AnalysisManager) {
   bool Changed = false;
 
   for (auto &F : Mod->functions()) {
-    auto Name = F.getName();
-    if (Name.starts_with("lgc.rt")) {
-      // Search for known HLSL intrinsics
-      llvm::forEachCall(F, [&](CallInst &CInst) {
-        auto Data = ToProcess.find(CInst.getFunction());
-        if (Data != ToProcess.end()) {
-          auto IntrImplEntry = llvm::findIntrImplEntryByIntrinsicCall(&CInst);
-          if (IntrImplEntry == std::nullopt)
-            return;
-
-          Data->second.IntrinsicCalls.push_back(&CInst);
-          Changed = true;
-        }
-      });
-    } else if (Name.contains("ContStack")) {
+    if (F.getName().contains("ContStack")) {
       Changed = true;
 
       auto &FAM = AnalysisManager.getResult<FunctionAnalysisManagerModuleProxy>(*Mod).getManager();
@@ -443,33 +351,6 @@ bool DXILContPostProcessPassImpl::handleIntrinsicCalls(llvm::ModuleAnalysisManag
   return Changed;
 }
 
-bool DXILContPostProcessPassImpl::replaceIntrinsicCalls(Function &F, const FunctionData &Data) {
-  if (Data.IntrinsicCalls.empty())
-    return false;
-
-  [[maybe_unused]] auto *FuncTy = F.getFunctionType();
-
-  assert(FuncTy->getNumParams() > Data.SystemDataArgumentIndex && "Missing system data argument");
-  Builder.SetInsertPointPastAllocas(&F);
-
-  // Intrinsics need a pointer, so allocate and store the system data argument
-  Value *SystemDataArgument = F.getArg(Data.SystemDataArgumentIndex);
-  Value *SystemDataPtr = Builder.CreateAlloca(Data.SystemDataTy);
-  SystemDataPtr->setName("system.data.alloca");
-  // Extract the original system data from the { systemData, padding, payload }
-  // struct returned by await.
-  if (!Data.IsStart)
-    SystemDataArgument = Builder.CreateExtractValue(SystemDataArgument, 0);
-  Builder.CreateStore(SystemDataArgument, SystemDataPtr);
-
-  for (auto *Call : Data.IntrinsicCalls)
-    replaceIntrinsicCall(Builder, Data.SystemDataTy, SystemDataPtr,
-                         ShaderStageHelper::dxilShaderKindToRtShaderStage(Data.Kind).value(), Call, GpurtLibrary,
-                         CrossInliner);
-
-  return true;
-}
-
 //
 // Entry point for all lgc.cps lowering.
 //
@@ -480,7 +361,6 @@ bool DXILContPostProcessPassImpl::lowerCpsOps() {
     DXILContPostProcessPassImpl &Self;
     bool &Changed;
     llvm_dialects::Builder &Builder;
-    Function *GetAddrAndMD;
   };
 
   // Note: It is a bit unlucky that we are using both a visitor for
@@ -501,7 +381,7 @@ bool DXILContPostProcessPassImpl::lowerCpsOps() {
                                      })
                                      .build();
 
-  CpsVisitorState State{*this, Changed, Builder, getContinuationGetAddrAndMD(*Mod)};
+  CpsVisitorState State{*this, Changed, Builder};
 
   struct CspCandidateInfo {
     bool RequiresCspArgument = false;
@@ -553,12 +433,12 @@ void DXILContPostProcessPassImpl::lowerJumpOp(lgc::cps::JumpOp &JumpOp) {
 
   SmallVector<Value *> TailArgs{JumpOp.getTail()};
   Value *RetAddr = Builder.CreateZExt(JumpOp.getRcr(), Builder.getInt64Ty());
-  if (auto WaitMask = ContHelper::tryGetWaitMask(JumpOp)) {
-    ContinueOp = Builder.create<lgc::ilcps::WaitContinueOp>(RCR, Builder.getInt64(WaitMask.value()),
-                                                            PoisonValue::get(Builder.getInt32Ty()), RetAddr, TailArgs);
+  if (ContHelper::isWaitAwaitCall(JumpOp)) {
+    ContinueOp =
+        Builder.create<lgc::ilcps::WaitContinueOp>(RCR, Builder.getInt64(-1), JumpOp.getCsp(), RetAddr, TailArgs);
     ContHelper::removeWaitMask(JumpOp);
   } else {
-    ContinueOp = Builder.create<lgc::ilcps::ContinueOp>(RCR, PoisonValue::get(Builder.getInt32Ty()), RetAddr, TailArgs);
+    ContinueOp = Builder.create<lgc::ilcps::ContinueOp>(RCR, JumpOp.getCsp(), RetAddr, TailArgs);
   }
 
   ContinueOp->copyMetadata(JumpOp);
@@ -656,20 +536,11 @@ PreservedAnalyses DXILContPostProcessPassImpl::run(ModuleAnalysisManager &Analys
   initializeProcessableFunctionData();
 
   Changed |= handleAmdInternals();
-  Changed |= handleIntrinsicCalls(AnalysisManager);
+  Changed |= handleContStackIntrinsics(AnalysisManager);
 
   for (auto &[Func, Data] : ToProcess) {
     ContHelper::IncomingRegisterCount::reset(Func);
     ContHelper::ContinuationStateByteCount::reset(Func);
-    Changed |= replaceIntrinsicCalls(*Func, Data);
-  }
-
-  for (auto &F : make_early_inc_range(*Mod)) {
-    auto FuncName = F.getName();
-    if (FuncName.starts_with("_AmdGetResumePointAddr")) {
-      Changed = true;
-      lowerGetResumePointAddr(F);
-    }
   }
 
   Changed |= lowerCpsOps();
diff --git a/llvmraytracing/lib/LegacyCleanupContinuations.cpp b/llvmraytracing/lib/LegacyCleanupContinuations.cpp
deleted file mode 100644
index 6a6c2b921b..0000000000
--- a/llvmraytracing/lib/LegacyCleanupContinuations.cpp
+++ /dev/null
@@ -1,706 +0,0 @@
-/*
- ***********************************************************************************************************************
- *
- *  Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
- *
- *  Permission is hereby granted, free of charge, to any person obtaining a copy
- *  of this software and associated documentation files (the "Software"), to
- *  deal in the Software without restriction, including without limitation the
- *  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- *  sell copies of the Software, and to permit persons to whom the Software is
- *  furnished to do so, subject to the following conditions:
- *
- *  The above copyright notice and this permission notice shall be included in all
- *  copies or substantial portions of the Software.
- *
- *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- *  IN THE SOFTWARE.
- *
- **********************************************************************************************************************/
-
-//= LegacyCleanupContinuations.cpp - Post-process output of coroutine passes =//
-//
-// Convert the result from the coroutine passes to something more suitable for
-// the compiler backend.
-//
-// Instead of return values, use continue and waitContinue intrinsics.
-// Add arguments to resume functions, which are the return values of the called
-// continuation.
-//
-//===----------------------------------------------------------------------===//
-
-#include "compilerutils/CompilerUtils.h"
-#include "llvmraytracing/Continuations.h"
-#include "llvmraytracing/ContinuationsUtil.h"
-#include "lgc/LgcCpsDialect.h"
-#include "lgc/LgcIlCpsDialect.h"
-#include "lgc/LgcRtDialect.h"
-#include "llvm-dialects/Dialect/Builder.h"
-#include "llvm-dialects/Dialect/Visitor.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Support/MathExtras.h"
-#include <cassert>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "legacy-cleanup-continuations"
-
-namespace {
-
-class LegacyCleanupContinuationsPassImpl {
-public:
-  LegacyCleanupContinuationsPassImpl(llvm::Module &Mod, llvm::ModuleAnalysisManager &AnalysisManager);
-
-  PreservedAnalyses run();
-
-private:
-  struct ContinuationData {
-    /// All functions belonging to this continuation, the entry function is the
-    /// first one
-    SmallVector<Function *> Functions;
-    /// Size of the continuation state in byte
-    uint32_t ContStateBytes = 0;
-    CallInst *MallocCall = nullptr;
-    MDNode *MD = nullptr;
-    // The continuation state on the CPS stack
-    Value *NewContState = nullptr;
-    /// Cleaned entry function, used to replace metadata
-    Function *NewStart = nullptr;
-
-    // Returns the number of bytes used on the CPS stack for the continuation
-    // state.
-    uint32_t getContStateStackBytes() const { return alignTo(ContStateBytes, RegisterBytes); }
-  };
-
-  void analyzeContinuation(Function &F, MDNode *MD);
-  // Run analysis parts that need to wait until all resume functions have been
-  // collected
-  void finalizeContinuationData(Function &StartFunc, ContinuationData &Data);
-  void processContinuation(Function *StartFunc, ContinuationData &FuncData);
-  void handleFunctionEntry(ContinuationData &Data, Function *F, bool IsEntry);
-  void handleContinue(ContinuationData &Data, Instruction *Ret);
-  void handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun);
-  bool lowerCompleteOp(Module &M);
-
-  Module &M;
-  LLVMContext &Context;
-  llvm::FunctionAnalysisManager &FAM;
-  llvm_dialects::Builder B;
-  Type *I32 = nullptr;
-  Type *I64 = nullptr;
-  Function *ContMalloc = nullptr;
-  Function *ContFree = nullptr;
-  MapVector<Function *, ContinuationData> ToProcess;
-  CompilerUtils::CrossModuleInliner CrossInliner;
-};
-
-/// Find the original call that created the continuation token and the matching
-/// resume function for a return value.
-///
-/// Returns a map (origin BB, (call that created the continuation token, resume
-/// function)).
-DenseMap<BasicBlock *, std::pair<CallInst *, Value *>> findTokenOrigin(BasicBlock *BB, Value *V,
-                                                                       SmallVectorImpl<Instruction *> &ToRemove) {
-  DenseMap<BasicBlock *, std::pair<CallInst *, Value *>> Result;
-  Value *Call = nullptr;
-  Value *ResumeFun = nullptr;
-  while (auto *Insert = dyn_cast<InsertValueInst>(V)) {
-    LLVM_DEBUG(dbgs() << "Insert: " << *Insert << "\n");
-    assert(Insert->getNumIndices() == 1 && "Expected a flat struct");
-    if (*Insert->idx_begin() == 0)
-      ResumeFun = Insert->getInsertedValueOperand();
-    else if (*Insert->idx_begin() == 1)
-      Call = Insert->getInsertedValueOperand();
-    V = Insert->getAggregateOperand();
-    ToRemove.push_back(Insert);
-  }
-
-  if (!ResumeFun) {
-    if (auto *Const = dyn_cast<ConstantStruct>(V))
-      ResumeFun = Const->getOperand(0);
-  }
-
-  assert(Call && "Did not find call that creates the token");
-  assert(ResumeFun && "Did not find resume function");
-
-  // Strip bitcast
-  if (auto *Cast = dyn_cast<BitCastInst>(ResumeFun)) {
-    ResumeFun = Cast->getOperand(0);
-    ToRemove.push_back(Cast);
-  }
-  if (auto *Const = dyn_cast<ConstantExpr>(ResumeFun)) {
-    if (Const->isCast())
-      ResumeFun = Const->getOperand(0);
-  }
-
-  auto RegisterTokenOrigin = [&Result](BasicBlock *TheBB, Value *Token, Value *TheResumeFun) {
-    assert(isa<Constant>(TheResumeFun) && "Resume function should be a constant function");
-    // Strip away bitcasts -- this can happen with multiple token types
-    if (auto *TokenBitcast = dyn_cast<BitCastOperator>(Token))
-      Token = TokenBitcast->getOperand(0);
-    assert(isa<CallInst>(Token) && "Call should be a CallInst");
-    auto *CallI = cast<CallInst>(Token);
-    Result.insert(std::make_pair(TheBB, std::make_pair(CallI, TheResumeFun)));
-  };
-
-  // Walk through phis
-  if (auto *CallPhi = dyn_cast<PHINode>(Call)) {
-    assert(isa<PHINode>(ResumeFun) && "Resume fun should also be a phi node");
-    auto *ResumeFunPhi = cast<PHINode>(ResumeFun);
-    ToRemove.push_back(CallPhi);
-    ToRemove.push_back(ResumeFunPhi);
-
-    for (auto CallEntry : llvm::zip(CallPhi->blocks(), CallPhi->incoming_values())) {
-      auto *PhiBB = std::get<0>(CallEntry);
-      auto *ResumeFunEntry = ResumeFunPhi->getIncomingValueForBlock(PhiBB);
-      assert(ResumeFunEntry && "Need a resume fun for each call");
-      RegisterTokenOrigin(PhiBB, std::get<1>(CallEntry), ResumeFunEntry);
-    }
-  } else {
-    RegisterTokenOrigin(BB, Call, ResumeFun);
-  }
-  return Result;
-}
-
-void LegacyCleanupContinuationsPassImpl::analyzeContinuation(Function &F, MDNode *MD) {
-  // Only analyze main continuation
-  auto *MDTup = cast<MDTuple>(MD);
-  auto *EntryF = mdconst::extract<Function>(MDTup->getOperand(0));
-
-  auto &Data = ToProcess[EntryF];
-
-  if (&F != EntryF) {
-    Data.Functions.push_back(&F);
-    return;
-  }
-  Data.Functions.insert(Data.Functions.begin(), &F);
-  Data.MD = MD;
-
-  // Search the malloc call to find the size of the continuation state
-  if (ContMalloc) {
-    forEachCall(*ContMalloc, [&](CallInst &Call) {
-      if (Call.getFunction() == &F) {
-        Data.MallocCall = &Call;
-      }
-    });
-  }
-
-  // Without malloc call, we check later if the continuation state is used
-  if (Data.MallocCall) {
-    Data.ContStateBytes = cast<ConstantInt>(Data.MallocCall->getArgOperand(0))->getSExtValue();
-  }
-}
-
-void LegacyCleanupContinuationsPassImpl::finalizeContinuationData(Function &StartFunc, ContinuationData &FuncData) {
-  if (FuncData.MallocCall)
-    return;
-
-  for (auto *F : FuncData.Functions) {
-    bool IsStart = (F == &StartFunc); // If this is the continuation start
-    Value *ContFrame;
-    if (IsStart)
-      ContFrame = F->getArg(F->arg_size() - 1);
-    else
-      ContFrame = F->getArg(0);
-    // If there are uses, we need to assume a size of
-    // MinimumContinuationStateBytes, because for all sizes up to this size
-    // coroutine passes will not emit a malloc that we can use to determine
-    // the exact size. If however the frame pointer is not used in any of
-    // the continuation functions, it's safe to assume an empty continuation
-    // state.
-    if (!ContFrame->user_empty()) {
-      assert(FuncData.ContStateBytes == 0);
-      FuncData.ContStateBytes = MinimumContinuationStateBytes;
-      break;
-    }
-  }
-}
-
-// For a resume function, find the continue call to it (by looking at its uses)
-// and obtain the incoming payload register count into the resume function
-// as the outgoing register count of the continue call, indicated by metadata.
-uint32_t getIncomingRegisterCount(Function *ResumeFunc) {
-  // For non-start functions, set (incoming) continuation registercount
-  // metadata by looking at the continue calls that reference this
-  // function. These continue calls both specify the number of their
-  // outgoing registers, and the number of incoming payload registers
-  // coming back into the resume function (i.e. us).
-  SmallVector<User *> Worklist(ResumeFunc->users());
-  std::optional<uint32_t> RegCount;
-  while (!Worklist.empty()) {
-    auto *U = Worklist.pop_back_val();
-    if (isa<Constant>(U) || isa<lgc::cps::AsContinuationReferenceOp>(U)) {
-      Worklist.append(U->user_begin(), U->user_end());
-      continue;
-    }
-    assert(isa<CallInst>(U) && "User of a resume function should be a call to continue");
-    auto *Inst = cast<CallInst>(U);
-    if (auto Count = ContHelper::ReturnedRegisterCount::tryGetValue(Inst)) {
-      assert((!RegCount || *RegCount == *Count) && "Got different returned registercounts in continues to "
-                                                   "the same resume function");
-      RegCount = *Count;
-#ifdef NDEBUG
-      break;
-#endif
-    } else {
-      LLVM_DEBUG(Inst->dump());
-      report_fatal_error("Found a continue call without "
-                         "continuation returned registercount metadata");
-    }
-  }
-  return RegCount.value();
-}
-
-Value *getContFrame(CallInst *MallocCall, Function *F, bool IsStart, SmallVectorImpl<Instruction *> &InstsToRemove) {
-  Value *ContFrame = nullptr;
-  if (MallocCall) {
-    if (IsStart) {
-      ContFrame = MallocCall;
-      InstsToRemove.push_back(MallocCall);
-
-      auto *BufferArg = F->getArg(F->arg_size() - 1);
-      auto *User = BufferArg->getUniqueUndroppableUser();
-      auto *Cast = dyn_cast<BitCastInst>(User);
-      if (Cast)
-        User = Cast->getUniqueUndroppableUser();
-      auto *Store = cast<StoreInst>(User);
-      InstsToRemove.push_back(Store); // Store needs to be eliminated first
-      if (Cast)
-        InstsToRemove.push_back(Cast);
-    } else {
-      // Look for the load of the allocated pointer
-      auto *User = F->getArg(0)->getUniqueUndroppableUser();
-      auto *Cast = dyn_cast<BitCastInst>(User);
-      if (Cast)
-        User = Cast->getUniqueUndroppableUser();
-      auto *Load = cast<LoadInst>(User);
-      InstsToRemove.push_back(Load); // Load needs to be eliminated first
-      if (Cast)
-        InstsToRemove.push_back(Cast);
-      ContFrame = Load;
-    }
-  } else {
-    if (IsStart)
-      ContFrame = F->getArg(F->arg_size() - 1);
-    else
-      ContFrame = F->getArg(0);
-  }
-  return ContFrame;
-}
-
-bool LegacyCleanupContinuationsPassImpl::lowerCompleteOp(Module &M) {
-  struct VisitState {
-    llvm_dialects::Builder &Builder;
-    bool completeLowered;
-  };
-
-  bool completeLowered = false;
-  VisitState State = {B, completeLowered};
-  static auto Visitor = llvm_dialects::VisitorBuilder<VisitState>()
-                            .add<lgc::cps::CompleteOp>([](VisitState &State, auto &complete) {
-                              State.Builder.SetInsertPoint(&complete);
-                              llvm::terminateShader(State.Builder, &complete);
-                              State.completeLowered = true;
-                            })
-                            .build();
-
-  Visitor.visit(State, M);
-  return State.completeLowered;
-}
-
-void LegacyCleanupContinuationsPassImpl::processContinuation(Function *StartFunc, ContinuationData &FuncData) {
-  auto *Void = Type::getVoidTy(Context);
-  LLVM_DEBUG(dbgs() << "Processing function: " << StartFunc->getName() << "\n");
-  bool IsEntry = StartFunc->hasMetadata(ContHelper::MDEntryName);
-  // The start function must come first to setup FuncData.NewStart and
-  // ContMDTuple which is used by processing the resume functions.
-  assert(StartFunc == FuncData.Functions[0]);
-  MDTuple *ContMDTuple = nullptr;
-
-  SmallVector<Function *> ToRemove;
-  struct NewFunctionInfo {
-    Function *Func;
-    bool IsStart;
-  };
-  SmallVector<NewFunctionInfo> NewFuncs;
-
-  for (auto *F : FuncData.Functions) {
-    if (F != StartFunc) {
-      // Entry marker should only be on the start and not on resume functions
-      F->eraseMetadata(Context.getMDKindID(ContHelper::MDEntryName));
-      // Same for stacksize
-      F->eraseMetadata(Context.getMDKindID(ContHelper::MDStackSizeName));
-      // Set same linkage as for start function
-      F->setLinkage(StartFunc->getLinkage());
-    }
-
-    // Ignore the stub created for the coroutine passes
-    if (F->empty())
-      return;
-
-    LLVM_DEBUG(dbgs() << "Processing function part: " << F->getName() << "\n");
-
-    bool IsStart = F == StartFunc; // If this is the continuation start
-    SmallVector<Type *> AllArgTypes;
-    SmallVector<Value *> AllArgValues;
-    SmallVector<Instruction *> InstsToRemove;
-    AttributeList FAttrs = F->getAttributes();
-    SmallVector<AttributeSet> ParamAttrs;
-
-    // Use all arguments except the last (pre-allocated buffer for the
-    // coroutine passes) for the continuation start
-    if (IsStart) {
-      unsigned ArgNo = 0;
-      assert(F->arg_size() >= 1 && "Entry function has at least one argument");
-      for (auto Arg = F->arg_begin(), ArgEnd = F->arg_end() - 1; Arg != ArgEnd; Arg++) {
-        AllArgTypes.push_back(Arg->getType());
-        AllArgValues.push_back(Arg);
-        ParamAttrs.push_back(FAttrs.getParamAttrs(ArgNo));
-        ArgNo++;
-      }
-    } else {
-      B.SetInsertPoint(&*F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca());
-
-      AllArgTypes.push_back(B.getInt64Ty()); // Dummy return address for resume functions
-      AllArgValues.push_back(nullptr);
-
-      // Find arguments from lgc.ilcps.getreturnvalue calls
-      for (auto &I : F->getEntryBlock()) {
-        if (auto *Intr = dyn_cast<lgc::ilcps::GetReturnValueOp>(&I)) {
-          AllArgTypes.push_back(Intr->getType());
-          AllArgValues.push_back(Intr);
-          InstsToRemove.push_back(Intr);
-        }
-      }
-    }
-
-    // Find the free call if there is one
-    if (ContFree) {
-      forEachCall(*ContFree, [&](CallInst &CI) { InstsToRemove.push_back(&CI); });
-    }
-
-    // Find the continuation state pointer, either returned by the malloc or
-    // given as an argument
-    Value *ContFrame = getContFrame(FuncData.MallocCall, F, IsStart, InstsToRemove);
-
-    // Try to eliminate unnecessary continuation state accesses
-    // of values that are still available as SSA values by a simple
-    // store-to-load forwarding routine.
-    // Ideally, LLVM coro passes should do better and not emit these
-    // loads to begin with.
-    auto &DT = FAM.getResult<DominatorTreeAnalysis>(*F);
-    forwardContinuationFrameStoreToLoad(DT, ContFrame);
-
-    // Create new empty function
-    F->eraseMetadata(FuncData.MD->getMetadataID());
-    auto *NewFuncTy = FunctionType::get(Void, AllArgTypes, false);
-    Function *NewFunc = CompilerUtils::cloneFunctionHeader(*F, NewFuncTy, ParamAttrs);
-    NewFunc->takeName(F);
-    NewFuncs.push_back({NewFunc, IsStart});
-
-    // Transfer code from old function to new function
-    llvm::moveFunctionBody(*F, *NewFunc);
-
-    // Set arg names for new function
-    // Skip the dummy return address for non-start functions
-    for (unsigned Idx = 0; Idx != NewFunc->getFunctionType()->params().size(); ++Idx) {
-      Value *OldVal = AllArgValues[Idx];
-      // Skip the dummy return address.
-      if (!OldVal)
-        continue;
-
-      Argument *Arg = NewFunc->getArg(Idx);
-      Arg->setName(OldVal->getName());
-      OldVal->replaceAllUsesWith(Arg);
-
-      if (IsStart) {
-        Argument *OldArg = F->getArg(Idx);
-        if (OldArg->hasInRegAttr())
-          Arg->addAttr(Attribute::InReg);
-        else
-          Arg->removeAttr(Attribute::AttrKind::InReg);
-      }
-    }
-
-    // Handle the function entry
-    B.SetInsertPoint(&*NewFunc->getEntryBlock().getFirstNonPHIOrDbgOrAlloca());
-    if (IsStart) {
-      FuncData.NewStart = NewFunc;
-      ContMDTuple = MDTuple::get(Context, {ValueAsMetadata::get(FuncData.NewStart)});
-    }
-    handleFunctionEntry(FuncData, NewFunc, IsEntry);
-
-    // Handle the function body
-
-    if (FuncData.NewContState) {
-      // Bitcast new cont state to the pointer type used by coro passes, but
-      // preserve the address space. Uses of the pointer are then fixed to also
-      // use the correct address space.
-      PointerType *UsedContFrameTy = cast<PointerType>(ContFrame->getType());
-      Value *CastNewContState = B.CreateBitCast(
-          FuncData.NewContState,
-          getWithSamePointeeType(UsedContFrameTy, FuncData.NewContState->getType()->getPointerAddressSpace()));
-      CompilerUtils::replaceAllPointerUses(&B, ContFrame, CastNewContState, InstsToRemove);
-    } else {
-      // If there is no continuation state, replace it with a poison
-      // value instead of a zero-sized stack allocation.
-      // This leads to nicer tests.
-      ContFrame->replaceAllUsesWith(PoisonValue::get(ContFrame->getType()));
-    }
-
-    // Handle the function returns.
-    // Treat returns and existing jumps separately, since otherwise we could accidentally free.
-    // returns originate from coro passes, indicating functions ending at suspend points, while
-    // lgc.cps.complete ends the lane. Leave existing jumps to resume functions as they are.
-
-    // We want to free the continuation stack when we end the original shader with a jump (a), but not at jumps that
-    // correspond to a suspend point (b). This collects the already existing jumps (a) into the PreExistingJumps vector.
-    // The jumps that correspond to a suspend point, (b), are introduced when lowering existing return instructions.
-    // To avoid that we accidentally iterate over these newly introduced jumps, we keep the existing rets (which will be
-    // translated to (b)) and existing jumps (a) separately. Before this pass, ret instructions mark a suspend point.
-    // However, after this shader, ret instructions mark the end of the thread. Finally, we have lgc.cps.complete, which
-    // is used to mark the lane termination, e. g. the end of RGS. These are translated to ret instructions as part of
-    // this pass.
-    // Note: Technically, it is not required to free the CPS stack at complete calls, but for consistency reasons, we do
-    // it anyway.
-    SmallVector<ReturnInst *> PreExistingRets;
-    SmallVector<lgc::cps::JumpOp *> PreExistingJumps;
-    for (auto &BB : make_early_inc_range(*NewFunc)) {
-      auto *I = BB.getTerminator();
-      if (I->getOpcode() == Instruction::Ret) {
-        PreExistingRets.push_back(cast<ReturnInst>(I));
-      } else if (I->getOpcode() == Instruction::Unreachable && BB.size() > 1) {
-        CallInst *PrevInst = cast<CallInst>(&*(--I->getIterator()));
-        if (auto *Jump = dyn_cast<lgc::cps::JumpOp>(PrevInst)) {
-          PreExistingJumps.push_back(Jump);
-          continue;
-        }
-
-        // Transform a lane-terminating lgc.cps.complete into a ret instruction.
-        // If this a non-terminating lgc.cps.jump, this will just free the stack.
-        if (isa<lgc::cps::CompleteOp>(PrevInst)) {
-          B.SetInsertPoint(PrevInst);
-
-          uint32_t NeededStackSize = FuncData.getContStateStackBytes();
-          if (NeededStackSize > 0)
-            B.create<lgc::cps::FreeOp>(B.getInt32(NeededStackSize));
-        } else {
-          LLVM_DEBUG(PrevInst->dump());
-          llvm_unreachable("Unexpected instruction!");
-        }
-      }
-    }
-
-    // First, handle the pre-existing jumps, (a).
-    for (auto *Jump : PreExistingJumps) {
-      B.SetInsertPoint(Jump);
-
-      uint32_t NeededStackSize = FuncData.getContStateStackBytes();
-      if (NeededStackSize > 0)
-        B.create<lgc::cps::FreeOp>(B.getInt32(NeededStackSize));
-    }
-
-    // Then, insert the new jumps for pre-existing returns / suspend points, (b).
-    for (auto *Ret : PreExistingRets)
-      handleContinue(FuncData, Ret);
-
-    for (auto *I : InstsToRemove)
-      I->eraseFromParent();
-
-    // Remove the old function
-    F->replaceAllUsesWith(ConstantExpr::getBitCast(NewFunc, F->getType()));
-    ToRemove.push_back(F);
-
-    // Update metadata
-    assert(ContMDTuple != nullptr);
-    NewFunc->setMetadata(ContHelper::MDContinuationName, ContMDTuple);
-  }
-
-  // Register count analysis needs to wait until all functions have been
-  // processed above, turning rets into continuation.[wait]continue calls.
-  for (auto [NewFunc, IsStart] : NewFuncs) {
-    if (!IsStart) {
-      uint32_t IncomingRegisterCount = getIncomingRegisterCount(NewFunc);
-      ContHelper::IncomingRegisterCount::setValue(NewFunc, IncomingRegisterCount);
-    }
-  }
-
-  for (auto *F : ToRemove)
-    F->eraseFromParent();
-}
-
-void LegacyCleanupContinuationsPassImpl::handleFunctionEntry(ContinuationData &Data, Function *F, bool IsEntry) {
-  uint64_t NeededStackSize = Data.getContStateStackBytes();
-  bool IsStart = F == Data.NewStart;
-
-  if (IsStart) {
-    // Add function metadata that stores how big the continuation state is in
-    // bytes
-    // Technically, continuation state includes the spilled payload here.
-    // However, we want to exclude it here for statistics.
-    uint32_t PayloadSpillSize = ContHelper::StackSize::tryGetValue(F).value_or(0);
-    assert(Data.ContStateBytes >= PayloadSpillSize);
-    ContHelper::ContinuationStateByteCount::setValue(F, Data.ContStateBytes - PayloadSpillSize);
-  }
-
-  if (NeededStackSize) {
-    Value *ContStateOnStack = nullptr;
-    if (IsStart) {
-      ContHelper::StackSize::setValue(F, NeededStackSize);
-
-      ContStateOnStack = B.create<lgc::cps::AllocOp>(B.getInt32(NeededStackSize));
-    } else {
-      ContStateOnStack = B.create<lgc::cps::PeekOp>(B.getInt32(NeededStackSize));
-    }
-
-    ContStateOnStack->setName("cont.state.stack.segment");
-
-    uint64_t ContStateNumI32s = divideCeil(Data.ContStateBytes, RegisterBytes);
-    auto *ContStateTy = ArrayType::get(I32, ContStateNumI32s);
-
-    // Peek into CSP stack to obtain continuation state.
-    // This can be handled in the same way for start and resume functions,
-    // because for start functions we already allocated space above.
-    Data.NewContState =
-        B.CreateBitCast(ContStateOnStack, ContStateTy->getPointerTo(lgc::cps::stackAddrSpace), "cont.state");
-  }
-}
-
-/// Transform
-///   %tok = call %continuation.token* @foo() !continuation.registercount !0
-///   %0 = insertvalue { i8*, %continuation.token* } { i8* bitcast ({ i8*,
-///     %continuation.token* } (i8*, i1)* @fun.resume.0 to i8*),
-///     %continuation.token* undef }, %continuation.token* %tok, 1
-///   ret { i8*, %continuation.token* } %0
-/// to
-///   %resume_addr = ptrtoint i8* ... @fun.resume.0 to i64
-///   %foo = ptrtoint %continuation.token* () @foo to i64
-///   call void @lgc.ilcps.continue(i64 %foo, i64
-///     %resume_addr, <foo args>) !continuation.registercount !0
-///   unreachable
-///
-/// Also handles cases where the token and resume function are behind a phi.
-void LegacyCleanupContinuationsPassImpl::handleContinue(ContinuationData &Data, Instruction *Ret) {
-  // Find the function call that generates the token
-  LLVM_DEBUG(dbgs() << "Converting ret to continue: " << *Ret << "\nArgument: " << *Ret->getOperand(0) << "\n");
-  auto *BB = Ret->getParent();
-  SmallVector<Instruction *> ToRemove;
-  ToRemove.push_back(Ret);
-  auto Calls = findTokenOrigin(Ret->getParent(), Ret->getOperand(0), ToRemove);
-
-  for (auto *I : ToRemove)
-    I->eraseFromParent();
-
-  for (auto &Entry : Calls) {
-    LLVM_DEBUG(dbgs() << "Handling call: " << *Entry.second.first << " with resume function " << Entry.second.second
-                      << "\n");
-    auto *Call = Entry.second.first;
-    auto *ResumeFun = Entry.second.second;
-    handleSingleContinue(Data, Call, ResumeFun);
-  }
-
-  if (BB->empty()) {
-    assert(BB->hasNPredecessorsOrMore(0) && "Handled all continues but the block still has predecessors left");
-    BB->eraseFromParent();
-  }
-}
-
-void LegacyCleanupContinuationsPassImpl::handleSingleContinue(ContinuationData &Data, CallInst *Call,
-                                                              Value *ResumeFun) {
-  // Pass resume address as argument
-  B.SetInsertPoint(Call);
-
-  auto *ContinuationReference = B.create<lgc::cps::AsContinuationReferenceOp>(I64, ResumeFun);
-
-  Value *JumpAddr = B.CreatePointerCast(Call->getCalledOperand(), I64);
-  SmallVector<Value *> TailArgs{Call->args()};
-
-  CallInst *Jump = B.create<lgc::cps::JumpOp>(JumpAddr, -1, PoisonValue::get(StructType::get(B.getContext())),
-                                              ContinuationReference, TailArgs);
-
-  Jump->copyMetadata(*Call);
-
-  assert(ContHelper::OutgoingRegisterCount::tryGetValue(Jump) && "Missing registercount metadata!");
-
-  // Remove instructions at the end of the block
-  auto *Unreachable = B.CreateUnreachable();
-  for (auto &I : make_early_inc_range(reverse(*Jump->getParent()))) {
-    if (&I == Unreachable)
-      break;
-    I.eraseFromParent();
-  }
-}
-
-LegacyCleanupContinuationsPassImpl::LegacyCleanupContinuationsPassImpl(llvm::Module &Mod,
-                                                                       llvm::ModuleAnalysisManager &AnalysisManager)
-    : M{Mod}, Context{M.getContext()},
-      FAM{AnalysisManager.getResult<FunctionAnalysisManagerModuleProxy>(Mod).getManager()}, B{Context} {
-  AnalysisManager.getResult<DialectContextAnalysis>(M);
-  ContMalloc = M.getFunction("continuation.malloc");
-  ContFree = M.getFunction("continuation.free");
-}
-
-PreservedAnalyses LegacyCleanupContinuationsPassImpl::run() {
-  bool Changed = false;
-
-  // Map the entry function of a continuation to the analysis result
-  for (auto &F : M.functions()) {
-    if (F.empty())
-      continue;
-    if (auto *MD = F.getMetadata(ContHelper::MDContinuationName)) {
-      analyzeContinuation(F, MD);
-    } else {
-      auto ShaderStage = lgc::rt::getLgcRtShaderStage(&F);
-      if (ShaderStage == lgc::rt::RayTracingShaderStage::Traversal ||
-          ShaderStage == lgc::rt::RayTracingShaderStage::KernelEntry) {
-        Changed = true;
-        // Add !continuation metadata to KernelEntry and Traversal after
-        // coroutine passes. The traversal loop is written as like the coroutine
-        // passes were applied manually.
-        MDTuple *ContMDTuple = MDTuple::get(Context, {ValueAsMetadata::get(&F)});
-        F.setMetadata(ContHelper::MDContinuationName, ContMDTuple);
-      }
-    }
-  }
-
-  // Check if the continuation state is used in any function part
-  for (auto &FuncData : ToProcess) {
-    finalizeContinuationData(*FuncData.first, FuncData.second);
-  }
-
-  Changed |= !ToProcess.empty();
-
-  if (!ToProcess.empty()) {
-    I32 = Type::getInt32Ty(Context);
-    I64 = Type::getInt64Ty(Context);
-
-    for (auto &FuncData : ToProcess) {
-      processContinuation(FuncData.first, FuncData.second);
-    }
-
-    fixupDxilMetadata(M);
-  }
-
-  Changed |= lowerCompleteOp(M);
-
-  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
-}
-
-} // namespace
-
-llvm::PreservedAnalyses LegacyCleanupContinuationsPass::run(llvm::Module &Mod,
-                                                            llvm::ModuleAnalysisManager &AnalysisManager) {
-  LLVM_DEBUG(dbgs() << "Run the cleanup-continuations pass\n");
-  AnalysisManager.getResult<DialectContextAnalysis>(Mod);
-  LegacyCleanupContinuationsPassImpl Impl(Mod, AnalysisManager);
-  return Impl.run();
-}
diff --git a/llvmraytracing/lib/LowerAwait.cpp b/llvmraytracing/lib/LowerAwait.cpp
index dfae9f65f6..5bdc56294a 100644
--- a/llvmraytracing/lib/LowerAwait.cpp
+++ b/llvmraytracing/lib/LowerAwait.cpp
@@ -59,40 +59,14 @@ class LowerAwaitPassImpl final {
   Module &Mod;
   MapVector<Function *, SmallVector<CallInst *>> ToProcess;
   void collectContinuationFunctions();
-  void processContinuations(bool IsLgcCpsMode);
+  void processContinuations();
 };
 } // anonymous namespace
 
-Function *llvm::getContinuationAwait(Module &M, Type *TokenTy, StructType *RetTy) {
-  std::string Name = "await";
-  auto &C = M.getContext();
-  auto *AwaitTy = FunctionType::get(RetTy, TokenTy, false);
-  auto *AwaitFun = Function::Create(AwaitTy, GlobalValue::LinkageTypes::ExternalLinkage, Name, &M);
-  AwaitFun->setAttributes(
-      AttributeList::get(C, AttributeList::FunctionIndex, {Attribute::NoUnwind, Attribute::WillReturn}));
-  return AwaitFun;
-}
-
 LowerAwaitPassImpl::LowerAwaitPassImpl(Module &Mod) : Mod{Mod} {
 }
 
-void LowerAwaitPassImpl::collectContinuationFunctions() {
-  for (auto &F : Mod.functions()) {
-    if (!F.getName().starts_with("await")) {
-      // Force processing annotated functions, even if they don't have await
-      // calls
-      if (F.hasMetadata(ContHelper::MDContinuationName))
-        ToProcess.insert({&F, {}});
-      continue;
-    }
-    for (auto *U : F.users()) {
-      if (auto *Inst = dyn_cast<CallInst>(U))
-        ToProcess[Inst->getFunction()].push_back(Inst);
-    }
-  }
-}
-
-void LowerAwaitPassImpl::processContinuations(bool IsLgcCpsMode) {
+void LowerAwaitPassImpl::processContinuations() {
   // We definitely have a call that requires continuation in this function
   //
   // If this is the first time we've done this for this function
@@ -195,22 +169,19 @@ void LowerAwaitPassImpl::processContinuations(bool IsLgcCpsMode) {
     for (auto *CI : FuncData.second) {
       B.SetInsertPoint(CI);
       Value *SuspendRetconArg = nullptr;
-      if (IsLgcCpsMode) {
-        SmallVector<Value *> Args;
-        SmallVector<Type *> ArgTys;
-        for (Value *Arg : CI->args()) {
-          Args.push_back(Arg);
-          ArgTys.push_back(Arg->getType());
-        }
-
-        // Insert a dummy call to remember the arguments to lgc.cps.await.
-        auto *ShaderTy = FunctionType::get(TokenTy, ArgTys, false);
-        auto *ShaderFun = B.CreateIntToPtr(CI->getArgOperand(0), ShaderTy->getPointerTo());
-        SuspendRetconArg = B.CreateCall(ShaderTy, ShaderFun, Args);
-        cast<CallInst>(SuspendRetconArg)->copyMetadata(*CI);
-      } else {
-        SuspendRetconArg = CI->getArgOperand(0);
+      SmallVector<Value *> Args;
+      SmallVector<Type *> ArgTys;
+      for (Value *Arg : CI->args()) {
+        Args.push_back(Arg);
+        ArgTys.push_back(Arg->getType());
       }
+
+      // Insert a dummy call to remember the arguments to lgc.cps.await.
+      auto *ShaderTy = FunctionType::get(TokenTy, ArgTys, false);
+      auto *ShaderFun = B.CreateIntToPtr(CI->getArgOperand(0), ShaderTy->getPointerTo());
+      SuspendRetconArg = B.CreateCall(ShaderTy, ShaderFun, Args);
+      cast<CallInst>(SuspendRetconArg)->copyMetadata(*CI);
+
       B.CreateIntrinsic(Intrinsic::coro_suspend_retcon, {B.getInt1Ty()}, SuspendRetconArg);
       auto *RetTy = CI->getType();
       if (!RetTy->isVoidTy()) {
@@ -225,24 +196,26 @@ void LowerAwaitPassImpl::processContinuations(bool IsLgcCpsMode) {
 PreservedAnalyses LowerAwaitPassImpl::run() {
   struct VisitorPayload {
     LowerAwaitPassImpl &Self;
-    bool HasCpsAwaitCalls = false;
   };
 
   static auto Visitor = llvm_dialects::VisitorBuilder<VisitorPayload>()
                             .add<lgc::cps::AwaitOp>([](VisitorPayload &Payload, auto &Op) {
                               Payload.Self.ToProcess[Op.getFunction()].push_back(&Op);
-                              Payload.HasCpsAwaitCalls = true;
                             })
                             .build();
 
   VisitorPayload P{*this};
   Visitor.visit(P, Mod);
 
-  collectContinuationFunctions();
+  for (auto &F : Mod) {
+    // Force processing annotated functions, even if they don't have await
+    // calls
+    if (F.hasMetadata(ContHelper::MDContinuationName))
+      ToProcess.insert({&F, {}});
+  }
 
   if (!ToProcess.empty()) {
-    bool IsLgcCpsMode = P.HasCpsAwaitCalls || ContHelper::isLgcCpsModule(Mod);
-    processContinuations(IsLgcCpsMode);
+    processContinuations();
     fixupDxilMetadata(Mod);
     return PreservedAnalyses::none();
   }
diff --git a/llvmraytracing/lib/LowerRaytracingPipeline.cpp b/llvmraytracing/lib/LowerRaytracingPipeline.cpp
index ac041a24ca..0c4116fd01 100644
--- a/llvmraytracing/lib/LowerRaytracingPipeline.cpp
+++ b/llvmraytracing/lib/LowerRaytracingPipeline.cpp
@@ -82,21 +82,6 @@ using namespace lgc::rt;
 
 namespace {
 
-// Create a GEP if I is non-null, otherwise return the pointer.
-static Value *SimplifyingCreateConstGEP1_32(IRBuilder<> &B, Type *Ty, Value *Ptr, uint32_t I) {
-  // A GEP with a single zero index is redundant with opaque pointers
-  if (I == 0)
-    return Ptr;
-  return B.CreateConstGEP1_32(Ty, Ptr, I);
-}
-
-static Value *SimplifyingCreateConstInBoundsGEP1_32(IRBuilder<> &B, Type *Ty, Value *Ptr, uint32_t I) {
-  // A GEP with a single zero index is redundant with opaque pointers
-  if (I == 0)
-    return Ptr;
-  return B.CreateConstInBoundsGEP1_32(Ty, Ptr, I);
-}
-
 // Helper struct to avoid recursively passing these arguments
 struct PayloadCopyHelper {
   Module &M;
@@ -175,14 +160,15 @@ struct PayloadCopyHelper {
     if (CompleteInterval.Begin < PayloadRegisterCount) {
       PAQIndexInterval Interval = {CompleteInterval.Begin, std::min(CompleteInterval.End, PayloadRegisterCount)};
       // Pointer to start of current interval in global payload
-      auto *GlobalIntervalI32Ptr = SimplifyingCreateConstInBoundsGEP1_32(B, I32, Serialization, Interval.Begin);
+      auto *GlobalIntervalI32Ptr =
+          CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(B, I32, Serialization, Interval.Begin);
       TmpIntervals.push_back({Interval, GlobalIntervalI32Ptr});
     }
     if (CompleteInterval.End > PayloadRegisterCount) {
       PAQIndexInterval Interval = {std::max(CompleteInterval.Begin, PayloadRegisterCount), CompleteInterval.End};
       // Pointer to start of current interval in global payload
-      auto *GlobalIntervalI32Ptr =
-          SimplifyingCreateConstInBoundsGEP1_32(B, I32, SpilledPayloadPtr, Interval.Begin - PayloadRegisterCount);
+      auto *GlobalIntervalI32Ptr = CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(
+          B, I32, SpilledPayloadPtr, Interval.Begin - PayloadRegisterCount);
       TmpIntervals.push_back({Interval, GlobalIntervalI32Ptr});
     }
 
@@ -193,7 +179,8 @@ struct PayloadCopyHelper {
       unsigned FieldI32Offset = *FieldByteOffset / RegisterBytes;
       assert(*FieldByteOffset == FieldI32Offset * RegisterBytes);
       // I32 pointer into field, offset by FieldI32Offset
-      auto *FieldIntervalI32Ptr = SimplifyingCreateConstInBoundsGEP1_32(B, I32, LocalFieldPtr, FieldI32Offset);
+      auto *FieldIntervalI32Ptr =
+          CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(B, I32, LocalFieldPtr, FieldI32Offset);
 
       // Determine Src and Dst
       auto *Src = FieldIntervalI32Ptr;
@@ -291,6 +278,10 @@ class LowerRaytracingPipelinePassImpl final {
     SmallVector<CallInst *> ShaderRecordBufferCalls;
     SmallVector<JumpOp *> JumpCalls;
 
+    /// In any-hit shaders, map known return instructions to their exit kind
+    /// for delayed hit attribute processing.
+    DenseMap<ReturnInst *, AnyHitExitKind> AnyHitExits;
+
     /// Pointer to the alloca'd system data object in this function
     AllocaInst *SystemData = nullptr;
     StructType *SystemDataTy = nullptr;
@@ -330,7 +321,7 @@ class LowerRaytracingPipelinePassImpl final {
 
   /// Needed data for handling the end of a function
   struct FunctionEndData {
-    Instruction *Terminator = nullptr;
+    ReturnInst *Terminator = nullptr;
     const PAQSerializationLayout *OutgoingSerializationLayout = nullptr;
     SmallVector<Value *> SavedRegisterValues;
     Value *NewPayload = nullptr;
@@ -501,6 +492,7 @@ class LowerRaytracingPipelinePassImpl final {
 
   void handleGetShaderKind(Function &Func);
   void handleGetCurrentFuncAddr(Function &Func);
+  void handleGetShaderRecIndex(Function &Func);
 
   void handleAmdInternalFunc(Function &Func);
 
@@ -509,6 +501,7 @@ class LowerRaytracingPipelinePassImpl final {
   void handleUnrematerializableCandidates();
 
   void collectGpuRtFunctions();
+  void determineDispatchSystemDataType();
 
   // Computes an upper bound on the number of required payload registers
   // for a TraceRay call, based on module-wide max attribute and payload size.
@@ -558,6 +551,7 @@ class LowerRaytracingPipelinePassImpl final {
                          bool GlobalToLocal, const PAQSerializationLayout *Layout);
   void processContinuations();
   void processFunctionEntry(FunctionData &Data, Argument *SystemDataArgument);
+  void prepareAnyHitExits(Function *F, FunctionData &Data);
   void processFunctionEnd(FunctionData &Data, FunctionEndData &EData);
   void processFunction(Function *F, FunctionData &FuncData);
   void handleContPayloadRegisterI32Count(Function &F);
@@ -654,12 +648,6 @@ CallInst *LowerRaytracingPipelinePassImpl::insertCpsAwait(Type *ReturnTy, Value
                                                           RayTracingShaderStage ShaderStage) {
   Builder.SetInsertPoint(Call);
 
-  Value *CR = nullptr;
-  if (ShaderAddr->getType()->getIntegerBitWidth() == 64)
-    CR = Builder.CreateTrunc(ShaderAddr, Type::getInt32Ty(Mod->getContext()));
-  else
-    CR = ShaderAddr;
-
   RayTracingShaderStage CallStage = RayTracingShaderStage::Count;
   if (CallType == ContinuationCallType::Traversal)
     CallStage = RayTracingShaderStage::Traversal;
@@ -671,7 +659,8 @@ CallInst *LowerRaytracingPipelinePassImpl::insertCpsAwait(Type *ReturnTy, Value
   assert(CallStage != RayTracingShaderStage::Count && "LowerRaytracingPipelinePassImpl::insertCpsAwait: Invalid "
                                                       "call stage before inserting lgc.cps.await operation!");
 
-  return Builder.create<AwaitOp>(ReturnTy, CR, 1 << static_cast<uint8_t>(getCpsLevelForShaderStage(CallStage)), Args);
+  return Builder.create<AwaitOp>(ReturnTy, Builder.CreateTrunc(ShaderAddr, RcrTy),
+                                 1 << static_cast<uint8_t>(getCpsLevelForShaderStage(CallStage)), Args);
 }
 
 Function *llvm::getSetLocalRootIndex(Module &M) {
@@ -850,9 +839,8 @@ void LowerRaytracingPipelinePassImpl::replaceReportHitCall(FunctionData &Data, C
   Instruction *Then = SplitBlockAndInsertIfThen(IsEnd, &*Builder.GetInsertPoint(), true);
   Builder.SetInsertPoint(Then);
 
-  FunctionEndData EData;
-  EData.Terminator = Then;
-  processFunctionEnd(Data, EData);
+  Builder.CreateRetVoid();
+  Then->eraseFromParent();
 }
 
 /// Replace a call to Await with a call to a given address and pass generated
@@ -978,9 +966,6 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy
     Args.push_back(HitAttrs);
   }
 
-  CallInst *Annotatable = nullptr;
-  Value *NewCall = nullptr;
-
   uint32_t OutgoingPayloadDwords = 0;
   if (Data.NumPassedThroughPayloadDwords.has_value()) {
     OutgoingPayloadDwords = Data.NumPassedThroughPayloadDwords.value();
@@ -1007,61 +992,44 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy
   if (IsWait)
     WaitMask = Call->getArgOperand(1);
 
-  if (IsLgcCpsMode) {
-    if (HasPayload) {
-      // Compute padding for the resume function so that payload starts at a
-      // fixed dword. NOTE: Minus 1 as in lgc.cps mode, shader index (i32) is not included.
-      PayloadHelper.computePaddingAndPayloadArgTys(ReturnedArgTys, ReturnedRegisterCount.value(),
-                                                   Data.FirstPayloadArgumentDword, 1);
-    }
-
-    auto *NewRetTy = StructType::get(Builder.getContext(), ReturnedArgTys);
-
-    Annotatable = insertCpsAwait(NewRetTy, ShaderAddr, Call, Args, CallType, Data.Kind);
-
-    NewCall = Annotatable;
-  } else {
-    // The wait mask isn't part of regular arguments and thus shouldn't be
-    // considered for padding. Thus, we first compute padding, and then add the
-    // wait mask.
-
+  uint32_t PaddingOffset = 1;
+  if (!IsLgcCpsMode) {
+    // The padding computed below for the resume function makes the payload start at a fixed dword. In lgc.cps
+    // mode the offset is 1 because the shader index (i32) is not included; otherwise no offset is applied.
+    PaddingOffset = 0;
     // Patch the return address into the await call, since it got excluded for
     // the padding computation previously. For WaitAwaitTraversal, this needs to
     // be removed later once we have the TraversalEntry function.
-    ArgTys.insert(ArgTys.begin(), RetAddr->getType());
     Args.insert(Args.begin(), RetAddr);
+  }
 
-    auto *ShaderTy = FunctionType::get(TokenTy, ArgTys, false);
-    auto *ShaderFun = Builder.CreateIntToPtr(ShaderAddr, ShaderTy->getPointerTo());
+  if (HasPayload) {
+    PayloadHelper.computePaddingAndPayloadArgTys(ReturnedArgTys, ReturnedRegisterCount.value(),
+                                                 Data.FirstPayloadArgumentDword, PaddingOffset);
+  }
 
-    auto *Token = Builder.CreateCall(ShaderTy, ShaderFun, Args);
+  auto *NewRetTy = StructType::get(Builder.getContext(), ReturnedArgTys);
 
-    if (HasPayload) {
-      PayloadHelper.computePaddingAndPayloadArgTys(ReturnedArgTys, ReturnedRegisterCount.value(),
-                                                   Data.FirstPayloadArgumentDword);
-    }
+  auto *NewCall = insertCpsAwait(NewRetTy, ShaderAddr, Call, Args, CallType, Data.Kind);
 
-    auto *NewRetTy = StructType::get(Builder.getContext(), ReturnedArgTys);
-    auto *Await = getContinuationAwait(*Mod, TokenTy, NewRetTy);
-    NewCall = Builder.CreateCall(Await, {Token});
-    Annotatable = Token;
+  if (WaitMask) {
+    // The only supported wait mask is a constant -1.
+    assert(cast<ConstantInt>(WaitMask)->getSExtValue() == -1);
+    ContHelper::setWaitMask(*NewCall);
   }
 
-  if (WaitMask)
-    ContHelper::setWaitMask(*Annotatable, cast<ConstantInt>(WaitMask)->getSExtValue());
-
   // Copy back returned payload to the payload serialization alloca as part of
   // the payload copying.
   if (HasPayload)
     Builder.CreateStore(Builder.CreateExtractValue(NewCall, ReturnedArgTys.size() - 1), Data.PayloadStorage);
 
-  ContHelper::ReturnedRegisterCount::setValue(Annotatable, ReturnedRegisterCount.value());
+  ContHelper::ReturnedRegisterCount::setValue(NewCall, ReturnedRegisterCount.value());
 
   auto OutgoingRegisterCount = std::min(OutgoingSerializationLayout ? OutgoingSerializationLayout->NumStorageI32s
                                                                     : MetadataState.getMaxPayloadRegisterCount(),
                                         MetadataState.getMaxPayloadRegisterCount());
   // Annotate call with the number of registers used for payload
-  ContHelper::OutgoingRegisterCount::setValue(Annotatable, OutgoingRegisterCount);
+  ContHelper::OutgoingRegisterCount::setValue(NewCall, OutgoingRegisterCount);
   if (OutgoingSerializationLayout) {
     MetadataState.updateMaxUsedPayloadRegisterCount(OutgoingRegisterCount);
     MetadataState.updateMaxUsedPayloadRegisterCount(ReturnedRegisterCount.value());
@@ -1071,7 +1039,7 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy
     // Copy global payload back to local payload
     // Overwrite the local payload with poison first, to make sure it is not
     // seen as live state.
-    Builder.CreateStore(PoisonValue::get(PayloadOrAttrsTy), PayloadOrAttrs);
+    Builder.CreateStore(Builder.CreateFreeze(PoisonValue::get(PayloadOrAttrsTy)), PayloadOrAttrs);
 
     if (CallType == ContinuationCallType::CallShader) {
       // For CallShader, there is only a single layout
@@ -1087,8 +1055,7 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy
   if (!Call->getType()->isVoidTy()) {
     // Extract the system data from the { %systemData, %padding, %payload }
     // struct returned by the await call.
-    NewCall = Builder.CreateExtractValue(NewCall, 0);
-    Call->replaceAllUsesWith(NewCall);
+    Call->replaceAllUsesWith(Builder.CreateExtractValue(NewCall, 0));
   }
 
   Call->eraseFromParent();
@@ -1192,8 +1159,8 @@ void llvm::copyBytes(IRBuilder<> &B, Value *Dst, Value *Src, uint64_t NumBytes)
   uint64_t NumFullI32s = NumBytes / RegisterBytes;
   // Copy full I32s
   for (uint64_t I32Index = 0; I32Index < NumFullI32s; ++I32Index) {
-    auto *DstPtr = SimplifyingCreateConstInBoundsGEP1_32(B, I32, Dst, I32Index);
-    auto *SrcPtr = SimplifyingCreateConstInBoundsGEP1_32(B, I32, Src, I32Index);
+    auto *DstPtr = CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(B, I32, Dst, I32Index);
+    auto *SrcPtr = CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(B, I32, Src, I32Index);
     auto *Val = B.CreateLoad(I32, SrcPtr);
     B.CreateStore(Val, DstPtr);
   }
@@ -1206,8 +1173,8 @@ void llvm::copyBytes(IRBuilder<> &B, Value *Dst, Value *Src, uint64_t NumBytes)
   // Create i8 loads and stores for the remaining bytes
   Type *I8 = B.getIntNTy(8);
   for (uint64_t I8Index = NumFullI32s * RegisterBytes; I8Index < NumBytes; ++I8Index) {
-    auto *DstPtr = SimplifyingCreateConstGEP1_32(B, I8, Dst, I8Index);
-    auto *SrcPtr = SimplifyingCreateConstGEP1_32(B, I8, Src, I8Index);
+    auto *DstPtr = CompilerUtils::simplifyingCreateConstGEP1_32(B, I8, Dst, I8Index);
+    auto *SrcPtr = CompilerUtils::simplifyingCreateConstGEP1_32(B, I8, Src, I8Index);
     auto *Val = B.CreateLoad(I8, SrcPtr);
     B.CreateStore(Val, DstPtr);
   }
@@ -1227,8 +1194,8 @@ void LowerRaytracingPipelinePassImpl::copyPayload(Type &PayloadTy, Value *LocalP
 
   Value *SpilledPayloadPtr = nullptr;
   if (Layout.PayloadMemPointerNode) {
-    auto *SpillPtr = SimplifyingCreateConstInBoundsGEP1_32(Builder, Builder.getInt8Ty(), PayloadStorage,
-                                                           FirstPayloadMemoryPointerRegister);
+    auto *SpillPtr = CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(Builder, Builder.getInt8Ty(), PayloadStorage,
+                                                                          FirstPayloadMemoryPointerRegister);
     SpilledPayloadPtr = Builder.CreateLoad(Builder.getPtrTy(lgc::cps::stackAddrSpace), SpillPtr);
   }
 
@@ -1290,7 +1257,7 @@ void LowerRaytracingPipelinePassImpl::savePayloadRegistersBeforeRecursion(
     for (const PAQIndexInterval &Interval : StorageInfo.IndexIntervals) {
       for (unsigned I = Interval.Begin; I < std::min(Interval.End, MetadataState.getMaxPayloadRegisterCount()); ++I) {
         // Create backup of the I-th payload register
-        auto *LoadPtr = SimplifyingCreateConstGEP1_32(Builder, I32, PayloadStorage, I);
+        auto *LoadPtr = CompilerUtils::simplifyingCreateConstGEP1_32(Builder, I32, PayloadStorage, I);
         auto *OldValue = Builder.CreateLoad(RegTy, LoadPtr);
         // As long as we keep a 32 bit alignment of all fields, all fields
         // get disjoint registers, and we should never save a register twice.
@@ -1314,7 +1281,7 @@ void LowerRaytracingPipelinePassImpl::restorePayloadRegistersAfterRecursion(
   for (unsigned I = 0; I < SavedRegisterValues.size(); ++I) {
     Value *OldValue = SavedRegisterValues[I];
     if (OldValue) {
-      auto *StorePtr = SimplifyingCreateConstGEP1_32(Builder, I32, PayloadStorage, I);
+      auto *StorePtr = CompilerUtils::simplifyingCreateConstGEP1_32(Builder, I32, PayloadStorage, I);
       Builder.CreateStore(SavedRegisterValues[I], StorePtr);
     }
   }
@@ -1381,8 +1348,8 @@ void LowerRaytracingPipelinePassImpl::copyHitAttributes(FunctionData &Data, Valu
     // Assume maximum possible size
     PayloadHitAttrBytes = MetadataState.getMaxHitAttributeByteCount() - InlineHitAttrsBytes;
     // Use hit attribute storage at fixed index
-    PayloadHitAttrs =
-        SimplifyingCreateConstGEP1_32(Builder, I32, Data.PayloadStorage, FirstPayloadHitAttributeStorageRegister);
+    PayloadHitAttrs = CompilerUtils::simplifyingCreateConstGEP1_32(Builder, I32, Data.PayloadStorage,
+                                                                   FirstPayloadHitAttributeStorageRegister);
   }
 
   uint64_t HitAttrsBytes = DL->getTypeStoreSize(Data.HitAttributes).getFixedValue();
@@ -1392,12 +1359,13 @@ void LowerRaytracingPipelinePassImpl::copyHitAttributes(FunctionData &Data, Valu
   LocalHitAttributes = Builder.CreateBitCast(LocalHitAttributes, RegTyPtr);
   auto *I8Ty = Builder.getInt8Ty();
   for (unsigned I = 0; I < divideCeil(HitAttrsBytes, RegisterBytes); I++) {
-    auto *LocalPtr = SimplifyingCreateConstInBoundsGEP1_32(Builder, RegTy, LocalHitAttributes, I);
+    auto *LocalPtr = CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(Builder, RegTy, LocalHitAttributes, I);
     Value *GlobalPtr;
     if (I < InlineRegSize)
-      GlobalPtr = SimplifyingCreateConstInBoundsGEP1_32(Builder, RegTy, InlineHitAttrs, I);
+      GlobalPtr = CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(Builder, RegTy, InlineHitAttrs, I);
     else
-      GlobalPtr = SimplifyingCreateConstInBoundsGEP1_32(Builder, RegTy, PayloadHitAttrs, I - InlineRegSize);
+      GlobalPtr =
+          CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(Builder, RegTy, PayloadHitAttrs, I - InlineRegSize);
 
     auto *LoadPtr = GlobalToLocal ? GlobalPtr : LocalPtr;
     auto *StorePtr = GlobalToLocal ? LocalPtr : GlobalPtr;
@@ -1410,8 +1378,9 @@ void LowerRaytracingPipelinePassImpl::copyHitAttributes(FunctionData &Data, Valu
       auto *ByteLoadPtr = Builder.CreateBitCast(LoadPtr, I8Ty->getPointerTo());
       auto *ByteStorePtr = Builder.CreateBitCast(StorePtr, I8Ty->getPointerTo());
       for (unsigned J = 0; J < HitAttrsBytes % RegisterBytes; J++) {
-        auto *Val = Builder.CreateLoad(I8Ty, SimplifyingCreateConstInBoundsGEP1_32(Builder, I8Ty, ByteLoadPtr, J));
-        Builder.CreateStore(Val, SimplifyingCreateConstInBoundsGEP1_32(Builder, I8Ty, ByteStorePtr, J));
+        auto *Val = Builder.CreateLoad(
+            I8Ty, CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(Builder, I8Ty, ByteLoadPtr, J));
+        Builder.CreateStore(Val, CompilerUtils::simplifyingCreateConstInBoundsGEP1_32(Builder, I8Ty, ByteStorePtr, J));
       }
     }
   }
@@ -1522,24 +1491,74 @@ void LowerRaytracingPipelinePassImpl::processFunctionEntry(FunctionData &Data, A
   }
 }
 
-void LowerRaytracingPipelinePassImpl::processFunctionEnd(FunctionData &Data, FunctionEndData &EData) {
-  AnyHitExitKind AHExitKind = AnyHitExitKind::None;
-  bool IsAnyHit = Data.Kind == RayTracingShaderStage::AnyHit;
+// Inserts default accept hit calls, lowers lgc.rt.{accept.hit.and.end.search,ignore.hit}, and records exit kinds.
+void LowerRaytracingPipelinePassImpl::prepareAnyHitExits(Function *F, FunctionData &Data) {
+  // First, collect returns not directly preceded by one of the exit intrinsics: these are default accepts.
+  SmallVector<ReturnInst *> AcceptReturns;
+  for (BasicBlock &BB : *F) {
+    if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator())) {
+      if (Ret != &*BB.begin() && isa<AcceptHitAndEndSearchOp, IgnoreHitOp>(Ret->getPrevNode()))
+        continue;
+
+      AcceptReturns.push_back(Ret);
+    }
+  }
+
+  // Now insert the accept hit calls. This adds new basic blocks, so we do it in a separate loop.
+  for (auto *Ret : AcceptReturns) {
+    Builder.SetInsertPoint(Ret);
+    assert(AcceptHit && "Could not find AcceptHit function");
+    auto *SystemDataTy = cast<StructType>(getFuncArgPtrElementType(AcceptHit, 0));
+    auto *SystemData = getDXILSystemData(Builder, Data.SystemData, Data.SystemDataTy, SystemDataTy);
+    CrossInliner.inlineCall(Builder, AcceptHit, SystemData);
+
+    Data.AnyHitExits.try_emplace(Ret, AnyHitExitKind::AcceptHit);
+  }
+
+  // Now collect and do the initial lowering of the intrinsics.
+  SmallVector<CallInst *> IntrinsicReturns;
+  static const auto Visitor =
+      llvm_dialects::VisitorBuilder<SmallVector<CallInst *>>()
+          .addSet<AcceptHitAndEndSearchOp, IgnoreHitOp>(
+              [](SmallVector<CallInst *> &List, Instruction &I) { List.push_back(cast<CallInst>(&I)); })
+          .build();
+
+  Visitor.visit(IntrinsicReturns, *F);
+
+  for (CallInst *I : IntrinsicReturns) {
+    // First, ensure that the next instruction is a return.
+    Instruction *Next = I->getNextNode();
+    ReturnInst *Ret = dyn_cast<ReturnInst>(Next);
+    if (!Ret) {
+      // unreachable should be a common next instruction since these ops are noreturn.
+      // If we don't have that, split the block -- everything after the intrinsic
+      // will become unreachable.
+      if (!isa<UnreachableInst>(Next)) {
+        BasicBlock *NewBB = I->getParent()->splitBasicBlockBefore(Next);
+        NewBB->takeName(Next->getParent());
+        Next = NewBB->getTerminator();
+      }
 
-  if (IsAnyHit) {
-    // Default to AcceptHit, which is only implicitly represented by
-    // the absence of a call to the other intrinsics.
-    AHExitKind = AnyHitExitKind::AcceptHit;
-    // Search backwards from the terminator to find a call to one of
-    // acceptHitAndEndSearch or ignoreHit.
-    if (EData.Terminator != EData.Terminator->getParent()->getFirstNonPHI()) {
-      auto Before = --EData.Terminator->getIterator();
-      if (isa<AcceptHitAndEndSearchOp>(Before))
-        AHExitKind = AnyHitExitKind::AcceptHitAndEndSearch;
-      else if (isa<IgnoreHitOp>(Before))
-        AHExitKind = AnyHitExitKind::IgnoreHit;
+      Builder.SetInsertPoint(Next);
+      Ret = Builder.CreateRetVoid();
+      Next->eraseFromParent();
     }
+
+    [[maybe_unused]] bool Inserted =
+        Data.AnyHitExits
+            .try_emplace(Ret, isa<AcceptHitAndEndSearchOp>(I) ? AnyHitExitKind::AcceptHitAndEndSearch
+                                                              : AnyHitExitKind::IgnoreHit)
+            .second;
+    assert(Inserted);
+
+    // Now replace the intrinsic call itself; the exit kind of its return was recorded above.
+    replaceIntrinsicCall(Builder, Data.SystemDataTy, Data.SystemData, Data.Kind, I, GpurtLibrary, CrossInliner);
   }
+}
+
+void LowerRaytracingPipelinePassImpl::processFunctionEnd(FunctionData &Data, FunctionEndData &EData) {
+  AnyHitExitKind AHExitKind = AnyHitExitKind::None;
+  bool IsAnyHit = Data.Kind == RayTracingShaderStage::AnyHit;
 
   Builder.SetInsertPoint(EData.Terminator);
 
@@ -1549,13 +1568,9 @@ void LowerRaytracingPipelinePassImpl::processFunctionEnd(FunctionData &Data, Fun
     assert(PayloadTy && "Missing payload type!");
 
     if (IsAnyHit) {
-      if (AHExitKind == AnyHitExitKind::AcceptHit) {
-        // Add a call to AcceptHit
-        assert(AcceptHit && "Could not find AcceptHit function");
-        auto *SystemDataTy = cast<StructType>(getFuncArgPtrElementType(AcceptHit, 0));
-        auto *SystemData = getDXILSystemData(Builder, Data.SystemData, Data.SystemDataTy, SystemDataTy);
-        CrossInliner.inlineCall(Builder, AcceptHit, SystemData);
-      }
+      auto It = Data.AnyHitExits.find(EData.Terminator);
+      assert(It != Data.AnyHitExits.end());
+      AHExitKind = It->second;
 
       EData.OutgoingSerializationLayout = &PAQManager.getOrCreateShaderExitSerializationLayout(
           *Data.IncomingPayloadSerializationInfo, Data.Kind, Data.HitAttributes, AHExitKind);
@@ -1642,7 +1657,7 @@ void LowerRaytracingPipelinePassImpl::processFunctionEnd(FunctionData &Data, Fun
 
   Instruction *Jump =
       Builder.create<lgc::cps::JumpOp>(ReturnAddr, Levels, PoisonValue::get(StructType::get(Builder.getContext())),
-                                       PoisonValue::get(RcrTy), TailArgList);
+                                       PoisonValue::get(I32), PoisonValue::get(RcrTy), TailArgList);
   Builder.CreateUnreachable();
   EData.Terminator->eraseFromParent();
 
@@ -2031,24 +2046,12 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData
   }
   Data.ReturnTy = NewRetTy;
 
-  // Modify function ends
-  // While iterating over function ends, basic blocks are inserted by inlining
-  // functions, so we copy them beforehand.
+  if (Data.Kind == RayTracingShaderStage::AnyHit)
+    prepareAnyHitExits(NewFunc, Data);
+
   if (Data.Kind == RayTracingShaderStage::Traversal) {
     PayloadHelper.patchJumpCalls(NewFunc, Data.JumpCalls, Data.FirstPayloadArgumentDword,
                                  Data.NumPassedThroughPayloadDwords, Data.PayloadStorage);
-  } else {
-    SmallVector<BasicBlock *> BBs(make_pointer_range(*NewFunc));
-    for (auto *BB : BBs) {
-      auto *I = BB->getTerminator();
-      assert(I && "BB must have terminator");
-      // Replace the end of the BB if it terminates the function
-      bool IsFunctionEnd = (I->getOpcode() == Instruction::Ret || I->getOpcode() == Instruction::Unreachable);
-      if (IsFunctionEnd) {
-        EData.Terminator = I;
-        processFunctionEnd(Data, EData);
-      }
-    }
   }
 
   // Remove the old function
@@ -2093,6 +2096,23 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData
   for (auto *Call : Data.IntrinsicCalls)
     replaceIntrinsicCall(Builder, Data.SystemDataTy, Data.SystemData, Data.Kind, Call, GpurtLibrary, CrossInliner);
 
+  // Modify function ends
+  // We do this close to the end because ReportHit handling can insert new returns.
+  if (Data.Kind != RayTracingShaderStage::Traversal) {
+    // While iterating over function ends, basic blocks are inserted by inlining
+    // functions, so we copy them beforehand.
+    SmallVector<BasicBlock *> BBs(make_pointer_range(*NewFunc));
+    for (auto *BB : BBs) {
+      auto *I = BB->getTerminator();
+      assert(I && "BB must have terminator");
+      // Replace the end of the BB if it terminates the function
+      if (auto *Ret = dyn_cast<ReturnInst>(I)) {
+        EData.Terminator = Ret;
+        processFunctionEnd(Data, EData);
+      }
+    }
+  }
+
 #ifndef NDEBUG
   if (!MetadataState.isInLgcCpsMode() && Data.Kind != RayTracingShaderStage::RayGeneration) {
     // Check that all returns have registercount metadata
@@ -2254,12 +2274,12 @@ void LowerRaytracingPipelinePassImpl::splitRestoreBB() {
 // Search for known intrinsics that cannot be rematerialized
 void LowerRaytracingPipelinePassImpl::handleUnrematerializableCandidates() {
   for (auto &Func : *Mod) {
-    if (!llvm::isLgcRtOp(&Func))
+    if (!lgc::rt::LgcRtDialect::isDialectOp(Func))
       continue;
 
     static const llvm_dialects::OpSet NonRematerializableDialectOps =
         llvm_dialects::OpSet::get<TraceRayOp, ReportHitOp, CallCallableShaderOp, ShaderIndexOp, ShaderRecordBufferOp,
-                                  JumpOp>();
+                                  JumpOp, AcceptHitAndEndSearchOp, IgnoreHitOp>();
     if (!NonRematerializableDialectOps.contains(Func)) {
       llvm::forEachCall(Func, [&](llvm::CallInst &CInst) {
         auto Data = ToProcess.find(CInst.getFunction());
@@ -2363,6 +2383,17 @@ void LowerRaytracingPipelinePassImpl::collectGpuRtFunctions() {
   });
 }
 
+void LowerRaytracingPipelinePassImpl::determineDispatchSystemDataType() {
+  Function *DispatchRaysIndex = GpurtLibrary->getFunction("_cont_DispatchRaysIndex3");
+  assert(DispatchRaysIndex &&
+         "LowerRaytracingPipelinePassImpl::determineDispatchSystemDataType: Could not find _cont_DispatchRaysIndex3!");
+
+  DispatchSystemDataTy = getFuncArgPtrElementType(DispatchRaysIndex, 0);
+  assert(DispatchSystemDataTy && "LowerRaytracingPipelinePassImpl::determineDispatchSystemDataType: Could "
+                                 "not derive DispatchSystemData "
+                                 "type from _cont_DispatchRaysIndex3!");
+}
+
 LowerRaytracingPipelinePassImpl::LowerRaytracingPipelinePassImpl(llvm::Module &M, Module &GpurtLibrary)
     : Mod{&M}, GpurtLibrary{&GpurtLibrary}, Context{&M.getContext()}, DL{&M.getDataLayout()},
       Builder{Mod->getContext()}, MetadataState{*Mod},
@@ -2372,10 +2403,7 @@ LowerRaytracingPipelinePassImpl::LowerRaytracingPipelinePassImpl(llvm::Module &M
 
 PreservedAnalyses LowerRaytracingPipelinePassImpl::run() {
   collectGpuRtFunctions();
-  DispatchSystemDataTy = getFuncArgPtrElementType(GetLocalRootIndex, 0);
-  assert(DispatchSystemDataTy && "LowerRaytracingPipelinePassImpl::run: Could "
-                                 "not derive DispatchSystemData "
-                                 "type from GetLocalRootIndex!");
+  determineDispatchSystemDataType();
 
   collectProcessableFunctions();
 
diff --git a/llvmraytracing/lib/PassRegistry.inc b/llvmraytracing/lib/PassRegistry.inc
index 848a30181f..fa421a67eb 100644
--- a/llvmraytracing/lib/PassRegistry.inc
+++ b/llvmraytracing/lib/PassRegistry.inc
@@ -46,20 +46,20 @@
 
 CONT_MODULE_ANALYSIS("dialect-context-analysis", DialectContextAnalysis(NeedDialectContext))
 
-CONT_MODULE_PASS("legacy-cleanup-continuations", LegacyCleanupContinuationsPass())
-CONT_MODULE_PASS("dxil-cleanup-continuations", DXILCleanupContinuationsPass())
 CONT_MODULE_PASS("cleanup-continuations", CleanupContinuationsPass())
+CONT_MODULE_PASS("continuations-lint", ContinuationsLintPass())
 CONT_MODULE_PASS("continuations-stats-report", ContinuationsStatsReportPass())
-CONT_MODULE_PASS("lower-raytracing-pipeline", LowerRaytracingPipelinePass())
-CONT_MODULE_PASS("lgc-cps-jump-inliner", LgcCpsJumpInlinerPass())
+CONT_MODULE_PASS("dxil-cleanup-continuations", DXILCleanupContinuationsPass())
 CONT_MODULE_PASS("dxil-cont-intrinsic-prepare", DXILContIntrinsicPreparePass())
 CONT_MODULE_PASS("dxil-cont-lgc-rt-op-converter", DXILContLgcRtOpConverterPass())
-CONT_MODULE_PASS("dxil-cont-post-process", DXILContPostProcessPass())
-CONT_MODULE_PASS("continuations-lint", ContinuationsLintPass())
 CONT_MODULE_PASS("dxil-cont-post-hook", DXILContPostHookPass())
+CONT_MODULE_PASS("dxil-cont-post-process", DXILContPostProcessPass())
 CONT_MODULE_PASS("dxil-cont-pre-hook", DXILContPreHookPass())
+CONT_MODULE_PASS("lgc-cps-jump-inliner", LgcCpsJumpInlinerPass())
 CONT_MODULE_PASS("lower-await", LowerAwaitPass())
+CONT_MODULE_PASS("lower-raytracing-pipeline", LowerRaytracingPipelinePass())
 CONT_MODULE_PASS("remove-types-metadata", RemoveTypesMetadataPass())
+CONT_MODULE_PASS("specialize-driver-shaders", SpecializeDriverShadersPass())
 
 CONT_CGSCC_PASS("dxil-coro-split", DXILCoroSplitPass())
 CONT_CGSCC_PASS("lgc-coro-split", LgcCoroSplitPass())
diff --git a/llvmraytracing/lib/PipelineState.cpp b/llvmraytracing/lib/PipelineState.cpp
index ffbda3b41d..b69c0d4f89 100644
--- a/llvmraytracing/lib/PipelineState.cpp
+++ b/llvmraytracing/lib/PipelineState.cpp
@@ -39,10 +39,11 @@ namespace {
 // Constants used in the msgpack format
 namespace MsgPackFormat {
 
-constexpr unsigned MajorVersion = 1;
+constexpr unsigned MajorVersion = 2;
 
 static constexpr char Version[] = "version";
 static constexpr char MaxUsedPayloadRegisterCount[] = "max_used_payload_register_count";
+static constexpr char SpecializeDriverShadersState[] = "specialize_driver_shaders_state";
 
 } // namespace MsgPackFormat
 } // anonymous namespace
@@ -65,6 +66,12 @@ Expected<PipelineState> PipelineState::decodeMsgpack(llvm::msgpack::DocNode &Roo
   PipelineState State = {};
   GetUInt(Node[MsgPackFormat::MaxUsedPayloadRegisterCount], State.MaxUsedPayloadRegisterCount);
 
+  auto &SDSNode = Node[MsgPackFormat::SpecializeDriverShadersState];
+  auto SDSStateOrErr = SpecializeDriverShadersState::decodeMsgpack(SDSNode);
+  if (auto Err = SDSStateOrErr.takeError())
+    return Err;
+  State.SDSState = *SDSStateOrErr;
+
   return State;
 }
 
@@ -82,6 +89,7 @@ void PipelineState::encodeMsgpack(llvm::msgpack::DocNode &Root) const {
   auto &Node = Root.getMap(true);
   Node[MsgPackFormat::Version] = MsgPackFormat::MajorVersion;
   Node[MsgPackFormat::MaxUsedPayloadRegisterCount] = MaxUsedPayloadRegisterCount;
+  SDSState.encodeMsgpack(Node[MsgPackFormat::SpecializeDriverShadersState]);
 }
 
 std::string PipelineState::encodeMsgpack() const {
@@ -100,6 +108,10 @@ llvm::Expected<PipelineState> PipelineState::fromModuleMetadata(const llvm::Modu
   auto OptMaxUsedPayloadRegCount = ContHelper::tryGetMaxUsedPayloadRegisterCount(M);
   if (OptMaxUsedPayloadRegCount.has_value())
     State.MaxUsedPayloadRegisterCount = *OptMaxUsedPayloadRegCount;
+  auto SDSStateOrErr = SpecializeDriverShadersState::fromModuleMetadata(M);
+  if (auto Err = SDSStateOrErr.takeError())
+    return Err;
+  State.SDSState = *SDSStateOrErr;
   return State;
 }
 
@@ -107,10 +119,12 @@ void PipelineState::exportModuleMetadata(llvm::Module &M) const {
   if (MaxUsedPayloadRegisterCount) {
     ContHelper::setMaxUsedPayloadRegisterCount(M, MaxUsedPayloadRegisterCount);
   }
+  SDSState.exportModuleMetadata(M);
 }
 
 void PipelineState::merge(const PipelineState &Other) {
   MaxUsedPayloadRegisterCount = std::max(MaxUsedPayloadRegisterCount, Other.MaxUsedPayloadRegisterCount);
+  SDSState.merge(Other.SDSState);
 }
 
 } // namespace llvmraytracing
diff --git a/llvmraytracing/lib/SpecializeDriverShaders.cpp b/llvmraytracing/lib/SpecializeDriverShaders.cpp
new file mode 100644
index 0000000000..16d1ac7a5e
--- /dev/null
+++ b/llvmraytracing/lib/SpecializeDriverShaders.cpp
@@ -0,0 +1,1321 @@
+/*
+ ***********************************************************************************************************************
+ *
+ *  Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal in the Software without restriction, including without limitation the
+ *  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ *  sell copies of the Software, and to permit persons to whom the Software is
+ *  furnished to do so, subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice shall be included in all
+ *  copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ *  IN THE SOFTWARE.
+ *
+ **********************************************************************************************************************/
+
+//===- SpecializeDriverShaders.cpp - Specialize driver shaders based on full-pipeline knowledge -------------------===//
+
+#include "llvmraytracing/SpecializeDriverShaders.h"
+#include "compilerutils/CompilerUtils.h"
+#include "compilerutils/ValueOriginTracking.h"
+#include "compilerutils/ValueSpecialization.h"
+#include "llvmraytracing/ContinuationsUtil.h"
+#include "lgc/LgcCpsDialect.h"
+#include "lgc/LgcIlCpsDialect.h"
+#include "lgc/LgcRtDialect.h"
+#include "llvm-dialects/Dialect/Visitor.h"
+#include "llvm/BinaryFormat/MsgPackDocument.h"
+#include "llvm/IR/Module.h"
+#include <cassert>
+
+using namespace llvm;
+using namespace CompilerUtils;
+
+#define DEBUG_TYPE "specialize-driver-shaders"
+// Normal debug output that is also used in testing is wrapped in LLVM_DEBUG
+// which can be enabled with --debug arguments.
+//
+// Even more detailed debug output is wrapped in DETAIL_DEBUG which can be enabled by changing EnableDetailDebugOutput.
+// This can be useful when debugging, for instance why a particular argument slot was not detected as preserved.
+static constexpr bool EnableDetailDebugOutput = false;
+#define DETAIL_DEBUG(BODY)                                                                                             \
+  LLVM_DEBUG({                                                                                                         \
+    if (EnableDetailDebugOutput) {                                                                                     \
+      BODY;                                                                                                            \
+    }                                                                                                                  \
+  })
+
+namespace {
+
+namespace MsgPackFormat {
+
+constexpr unsigned MajorVersion = 1;
+
+static constexpr char Version[] = "version";
+static constexpr char TraversalArgsInfo[] = "traversal_args_info";
+
+} // namespace MsgPackFormat
+
+namespace MetadataFormat {
+
+// For metadata, we don't need to safeguard against version mismatches,
+// as metadata is only used temporarily within modules and not stored to disk,
+// so every metadata we deserialize has been serialized by the same version of ourselves.
+//
+// We use an `lgc.rt` prefix even though this is not officially part of the lgc.rt dialect to indicate this is part
+// of llvmraytracing. It is however private metadata of this pass and not accessed elsewhere.
+static constexpr char State[] = "lgc.rt.specialize.driver.shaders.state";
+static constexpr char Options[] = "lgc.rt.specialize.driver.shaders.opts";
+
+} // namespace MetadataFormat
+
+namespace MDHelper {
+// Reads MD as a ConstantInt and returns its zero-extended value, or std::nullopt if MD is null.
+std::optional<uint32_t> extractZExtI32Constant(Metadata *MD) {
+  if (!MD)
+    return std::nullopt;
+  const uint64_t Wide = mdconst::extract<ConstantInt>(MD)->getZExtValue();
+  assert(Wide <= std::numeric_limits<uint32_t>::max());
+  return static_cast<uint32_t>(Wide);
+}
+
+Metadata *getI32MDConstant(LLVMContext &Context, uint32_t Value) {
+  // Wrap Value as an i32 ConstantInt so that it can be used as a metadata operand.
+  Metadata *Wrapped = ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), Value));
+  assert(Wrapped && "Failed to create metadata node!");
+  assert(extractZExtI32Constant(Wrapped) == Value && "Failed to extract value from node!");
+  return Wrapped;
+}
+
+} // namespace MDHelper
+
+// Utilities to keep track of the "status" of individual arg slots.
+// There is some similarity between these pairs of types:
+//  * ArgSlotStatus and ValueTracking::SliceStatus
+//  * ArgSlotInfo and ValueTracking::SliceInfo
+//  * ArgSlotsInfo and ValueTracking::ValueInfo
+//
+// The main difference is due to the notion of "Preserved" arguments,
+// which doesn't make sense for general values, and due to the fact
+// that we don't care about the contents of (non-preserved) dynamic arguments.
+// Also, we don't support bitmasks of multiple possible status, and instead
+// treat multi-status cases conservatively.
+enum class ArgSlotStatus : uint32_t {
+  Dynamic = 0,   // The arg slot is set to an unknown value and does not preserve the corresponding incoming arg slot.
+  Constant,      // The arg slot is set to a known constant
+  UndefOrPoison, // The arg slot is undef or poison
+  Preserve,      // The arg slot preserves the corresponding incoming arg slot.
+                 //  Only used for in-Traversal functions, like Traversal or AHS,
+                 //  but not for jumps from non-Traversal functions to Traversal functions (e.g. TraceRay call sites).
+  Count          // Number of valid statuses. Deserialized values >= Count are rejected as invalid.
+};
+
+// Returns the printable name of a status.
+// With Compact set, a single letter (D/C/U/P) is returned instead of the full name.
+// Any value without a name, including Count, is reported as a fatal error.
+StringRef toString(ArgSlotStatus AS, bool Compact = false) {
+  if (AS == ArgSlotStatus::Dynamic)
+    return Compact ? "D" : "Dynamic";
+  if (AS == ArgSlotStatus::Constant)
+    return Compact ? "C" : "Constant";
+  if (AS == ArgSlotStatus::UndefOrPoison)
+    return Compact ? "U" : "UndefOrPoison";
+  if (AS == ArgSlotStatus::Preserve)
+    return Compact ? "P" : "Preserve";
+
+  report_fatal_error("Unexpected value " + Twine(static_cast<int>(AS)));
+}
+
+// Streams the full (non-compact) name of the status.
+[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const ArgSlotStatus &AS) {
+  return OS << toString(AS);
+}
+
+// This is stored for every incoming arg slot and every function we'd like to specialize.
+struct ArgSlotInfo {
+  ArgSlotStatus Status = ArgSlotStatus::Dynamic;
+  uint32_t ConstantValue = 0; // Only printed when Status is Constant; always part of operator==.
+
+  // Prints the status; in non-compact mode, a constant is followed by "=0x" and its value in hex.
+  void print(llvm::raw_ostream &OS, bool Compact = false) const {
+    OS << toString(Status, Compact);
+    if (Compact || Status != ArgSlotStatus::Constant)
+      return;
+    OS << "=0x";
+    OS.write_hex(ConstantValue);
+  }
+
+  // Merges the infos of two sources for the same arg slot:
+  //  * Preserve is neutral, the other side is returned unchanged.
+  //  * Otherwise, Dynamic on either side yields Dynamic.
+  //  * Otherwise, UndefOrPoison yields the other side (undef + constant -> constant).
+  //    If we wanted to treat poison/undef as constant zero instead, this is the place we'd need to change.
+  //  * Two constants stay constant if their values agree, and become Dynamic otherwise.
+  static ArgSlotInfo combine(const ArgSlotInfo &LHS, const ArgSlotInfo &RHS) {
+    const auto Is = [](const ArgSlotInfo &Info, ArgSlotStatus S) { return Info.Status == S; };
+    if (Is(LHS, ArgSlotStatus::Preserve))
+      return RHS;
+    if (Is(RHS, ArgSlotStatus::Preserve))
+      return LHS;
+    if (Is(LHS, ArgSlotStatus::Dynamic) || Is(RHS, ArgSlotStatus::Dynamic))
+      return {ArgSlotStatus::Dynamic};
+    if (Is(LHS, ArgSlotStatus::UndefOrPoison))
+      return RHS;
+    if (Is(RHS, ArgSlotStatus::UndefOrPoison))
+      return LHS;
+    assert(Is(LHS, ArgSlotStatus::Constant) && Is(RHS, ArgSlotStatus::Constant));
+    return LHS.ConstantValue == RHS.ConstantValue ? LHS : ArgSlotInfo{ArgSlotStatus::Dynamic};
+  }
+
+  bool operator==(const ArgSlotInfo &Other) const {
+    return Status == Other.Status && ConstantValue == Other.ConstantValue;
+  }
+  bool operator!=(const ArgSlotInfo &Other) const { return !(*this == Other); }
+};
+
+[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const ArgSlotInfo &AS) {
+  AS.print(OS); // Uses the default, non-compact form.
+  return OS;
+}
+
+// Infos about all relevant arg slots of a function.
+struct ArgSlotsInfo {
+  SmallVector<ArgSlotInfo> ArgSlots;
+
+  // Decodes an info from a msgpack array of Status/ConstantValue pairs, as written by encodeMsgpack.
+  // Returns an error on odd array lengths, empty nodes, and on status or constant values that are out of range.
+  static llvm::Expected<ArgSlotsInfo> decodeMsgpack(llvm::msgpack::DocNode &Node) {
+    auto &ArrNode = Node.getArray();
+    if (ArrNode.size() % 2)
+      return make_error<StringError>("expected even array length", inconvertibleErrorCode());
+
+    ArgSlotsInfo Result{};
+    Result.ArgSlots.resize(ArrNode.size() / 2);
+    for (unsigned Idx = 0; Idx < Result.ArgSlots.size(); ++Idx) {
+      auto &StatusNode = ArrNode[2 * Idx];
+      auto &ConstantValueNode = ArrNode[2 * Idx + 1];
+      if (StatusNode.isEmpty() || ConstantValueNode.isEmpty())
+        return make_error<StringError>("unexpected empty nodes", inconvertibleErrorCode());
+      // Check ranges on the raw 64-bit values: narrowing first would wrap corrupt values into the valid range.
+      const uint64_t RawStatus = StatusNode.getUInt();
+      if (RawStatus >= static_cast<uint64_t>(ArgSlotStatus::Count))
+        return make_error<StringError>("invalid status", inconvertibleErrorCode());
+      const uint64_t RawConstantValue = ConstantValueNode.getUInt();
+      if (RawConstantValue > std::numeric_limits<uint32_t>::max())
+        return make_error<StringError>("invalid constant value", inconvertibleErrorCode());
+      Result.ArgSlots[Idx] = {static_cast<ArgSlotStatus>(RawStatus), static_cast<uint32_t>(RawConstantValue)};
+    }
+    return Result;
+  }
+
+  // Encodes the info into Node as a flat msgpack array with two entries per arg slot.
+  void encodeMsgpack(llvm::msgpack::DocNode &Node) const {
+    auto &ArrNode = Node.getArray(true);
+    unsigned Idx = 0;
+    for (const ArgSlotInfo &ASI : ArgSlots) {
+      // Serialize ArgSlotInfo using two 32-bit values: the status, followed by the constant (if there is one)
+      ArrNode[Idx++] = static_cast<uint32_t>(ASI.Status);
+      ArrNode[Idx++] = ASI.ConstantValue;
+    }
+  }
+
+  // Decodes an info from a metadata node with Status/ConstantValue operand pairs, as created by exportAsMetadata.
+  static llvm::Expected<ArgSlotsInfo> fromMetadata(const llvm::MDNode *MD) {
+    unsigned NumMDOperands = MD->getNumOperands();
+    if (NumMDOperands % 2)
+      return make_error<StringError>("expected even array length", inconvertibleErrorCode());
+    unsigned NumArgs = NumMDOperands / 2;
+    ArgSlotsInfo Result{};
+    Result.ArgSlots.resize(NumArgs);
+    for (unsigned Idx = 0; Idx < NumArgs; ++Idx) {
+      auto OptStatus = MDHelper::extractZExtI32Constant(MD->getOperand(2 * Idx));
+      auto OptConstantValue = MDHelper::extractZExtI32Constant(MD->getOperand(2 * Idx + 1));
+      if (!OptStatus.has_value() || !OptConstantValue.has_value())
+        return make_error<StringError>("unexpected missing values", inconvertibleErrorCode());
+
+      ArgSlotStatus Status = static_cast<ArgSlotStatus>(*OptStatus);
+      if (Status >= ArgSlotStatus::Count)
+        return make_error<StringError>("invalid status", inconvertibleErrorCode());
+      Result.ArgSlots[Idx] = {Status, *OptConstantValue};
+    }
+    return Result;
+  }
+
+  // Creates a metadata tuple with two i32 constants per arg slot, in the format expected by fromMetadata.
+  llvm::MDNode *exportAsMetadata(LLVMContext &Context) const {
+    SmallVector<Metadata *> Entries;
+    Entries.reserve(2 * ArgSlots.size());
+    for (const ArgSlotInfo &ASI : ArgSlots) {
+      // Serialize ArgSlotInfo using two 32-bit values: The first one gives
+      // the status, the second one the constant (if there is one)
+      Entries.push_back(MDHelper::getI32MDConstant(Context, static_cast<uint32_t>(ASI.Status)));
+      Entries.push_back(MDHelper::getI32MDConstant(Context, ASI.ConstantValue));
+    }
+    return MDTuple::get(Context, Entries);
+  }
+
+  // Combines two infos slot by slot; slots that are present in only one of them are taken over unchanged.
+  static ArgSlotsInfo combine(const ArgSlotsInfo &LHS, const ArgSlotsInfo &RHS) {
+    // Canonicalize which one is the larger one, this simplifies the combine logic
+    const ArgSlotsInfo *SmallInfo = &LHS;
+    const ArgSlotsInfo *LargeInfo = &RHS;
+    if (SmallInfo->ArgSlots.size() > LargeInfo->ArgSlots.size())
+      std::swap(SmallInfo, LargeInfo);
+    // Start from the larger info, then merge the smaller one into the slots that both of them have.
+    ArgSlotsInfo Result = *LargeInfo;
+    for (unsigned ArgIdx = 0; ArgIdx < SmallInfo->ArgSlots.size(); ++ArgIdx)
+      Result.ArgSlots[ArgIdx] = ArgSlotInfo::combine(SmallInfo->ArgSlots[ArgIdx], LargeInfo->ArgSlots[ArgIdx]);
+    return Result;
+  }
+
+  // Prints all slot infos in order. In non-compact mode, consecutive infos are separated by "; ".
+  void print(llvm::raw_ostream &OS, bool Compact = false) const {
+    for (const auto &[Idx, ASI] : enumerate(ArgSlots)) {
+      if (!Compact && Idx)
+        OS << "; ";
+      ASI.print(OS, Compact);
+    }
+  }
+
+  // Prints a compact output, together with table headers indicating argument slot indices, like this:
+  // <Indent>0         1         2
+  // <Indent>012345678901234567890
+  // <Indent>DDDDPCCDDDDDDPPDDDDDD
+  void printTable(llvm::raw_ostream &OS, StringRef Indent) const {
+    OS << Indent;
+    if (ArgSlots.empty()) {
+      OS << "<empty>\n";
+      return;
+    }
+    for (unsigned Idx = 0; Idx < ArgSlots.size(); ++Idx) {
+      if (Idx % 10 == 0)
+        OS << (Idx / 10) % 10;
+      else
+        OS << ' ';
+    }
+    OS << '\n' << Indent;
+    for (unsigned Idx = 0; Idx < ArgSlots.size(); ++Idx)
+      OS << Idx % 10;
+    OS << '\n' << Indent;
+    print(OS, true);
+    OS << '\n';
+  }
+
+  bool operator==(const ArgSlotsInfo &Other) const { return ArgSlots == Other.ArgSlots; }
+  bool operator!=(const ArgSlotsInfo &Other) const { return !(*this == Other); }
+};
+
+[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const ArgSlotsInfo &AI) {
+  AI.print(OS); // Uses the default, non-compact form: slot infos separated by "; ".
+  return OS;
+}
+
+// This is a simplified version of ValueTracking::SliceInfo, specialized for the purpose of detecting
+// preserved argument slot dwords. It stores a value it refers to, and a byte offset into that value.
+//
+// For every incoming argument slot, we create a ValueWithOffset that points to the corresponding dword
+// in the incoming argument.
+// For every outgoing argument slot, we use value origin tracking to determine whether it in fact matches
+// the corresponding incoming argument slot dword.
+struct ValueWithOffset {
+  Value *Val = nullptr;
+  unsigned ByteOffset = ~0u; // All bits set by default.
+  bool operator==(const ValueWithOffset &Other) const {
+    return Val == Other.Val && ByteOffset == Other.ByteOffset;
+  }
+  bool operator!=(const ValueWithOffset &Other) const { return !(*this == Other); }
+};
+
+struct IncomingArgSlotValuesWithOffsets {
+  // Indexed by dword arg slot. For every incoming arg slot of a function, points into the scalar corresponding to
+  // that argument slot within the argument containing the arg slot.
+  // If an argument slot doesn't correspond to a full aligned dword within the containing argument type, then the value
+  // of this arg slot is set to nullptr instead to indicate that we can't detect preservation of this arg slot.
+  SmallVector<ValueWithOffset> ArgSlots;
+  // For awaits during Traversal (e.g. ReportHit), we also want to allow preserving the awaited result instead of
+  // incoming arguments.
+  // We do this by telling the ValueOriginTracker to assume await results to equal corresponding incoming function args.
+  // This is a mapping of awaited results to incoming arguments that can be passed to the value origin tracker
+  // so it understands this assumption.
+  // Use std::optional so we can safely move from this value and reset the optional, catching unintended accesses.
+  std::optional<ValueOriginTracker::ValueOriginAssumptions> AwaitOriginAssumptions =
+      ValueOriginTracker::ValueOriginAssumptions{}; // Starts out engaged with a value-initialized mapping.
+};
+
+// Info for a single arg slot as part of ArgumentLayoutInfo
+class ArgumentLayoutSlotInfo {
+public:
+  ArgumentLayoutSlotInfo(unsigned ByteOffset, unsigned NumBytes)
+      : ByteOffset{ByteOffset}, CoversAlignedDword{(ByteOffset % 4 == 0) && (NumBytes == 4)} {}
+  // For the value corresponding to the arg slot within the containing type, stores the corresponding byte offset into
+  // the as-in-memory layout of the type. For instance, given Ty = { i32, i64 }, and assuming i64 is 64-bit aligned,
+  // then Ty occupies three arg slots at offsets 0, 8 and 12 into the type. The dword at offset 4 is padding and does
+  // not have a corresponding arg slot.
+  unsigned ByteOffset;
+  // If the ByteOffset is not dword aligned, then we cannot keep track of this value with value tracking which uses
+  // dword slices. Also, if the offset is dword aligned, but the value doesn't cover the whole dword, we also
+  // can't prove the value to be preserved, as we can't tell whether the whole value is preserved, or just a prefix.
+  //
+  // We currently handle small types that don't cover full dwords (e.g. i16) conservatively.
+  // Some cases, e.g. just forwarding a single i16, are currently considered as dynamic where in fact
+  // we could consider them as preserve, because only non-poison outgoing bits are relevant for the analysis.
+  // However, other cases where incoming high implicit poison bits are populated may not be treated as preserve.
+  // For instance, consider an incoming <2 x i16> %arg argument that covers two argument slots, but the type is a single
+  // dword large. If the function bitcasts the argument to an i32 and passes that i32 to an outgoing argument slot,
+  // value origin analysis on the i32 might conclude that it originates from a matching incoming argument slot
+  // (value %arg, offset 0), and thus can be considered as preserve, missing the fact that the high 16 bits of the
+  // argument slot were previously poison. These poison bits are not present in the <2 x i16> argument type.
+  //
+  // As long as we don't expect i16s in arguments, we thus keep the analysis simpler by handling i16s conservatively.
+  bool CoversAlignedDword; // True iff ByteOffset is a multiple of four and the slot covers exactly four bytes.
+};
+
+// Describes how a type is laid out in in-register argument slots.
+class ArgumentLayoutInfo {
+public:
+  // One entry per argument slot occupied by the type, in argument slot order.
+  SmallVector<ArgumentLayoutSlotInfo> SlotInfos;
+
+  // Number of argument slots occupied by the type.
+  unsigned numArgumentSlots() const { return SlotInfos.size(); }
+
+  // Computes the layout of a value of type Ty. Byte offsets are relative to the start of the value.
+  static ArgumentLayoutInfo get(Type *Ty, const DataLayout &DL) {
+    ArgumentLayoutInfo Result{};
+    populateRecursively(Ty, DL, Result, 0);
+    return Result;
+  }
+
+private:
+  // Appends the arg slots of a single scalar of NumBytes bytes at ByteOffset: one slot per started dword.
+  // The last slot covers fewer than four bytes if NumBytes is not a multiple of four.
+  static void appendScalarSlots(ArgumentLayoutInfo &Result, unsigned ByteOffset, unsigned NumBytes) {
+    for (unsigned SlotOffset = 0; SlotOffset < NumBytes; SlotOffset += 4)
+      Result.SlotInfos.emplace_back(ByteOffset + SlotOffset, std::min(4u, NumBytes - SlotOffset));
+  }
+
+  // Recursively populate Result, assuming a (possibly nested) value of the given type at the given byte offset.
+  static void populateRecursively(Type *Ty, const DataLayout &DL, ArgumentLayoutInfo &Result,
+                                  unsigned AccumByteOffset) {
+    // Detect how many arg slots we added, and at the end assert that it matches the expectation
+    [[maybe_unused]] unsigned PrevNumArgSlots = Result.numArgumentSlots();
+    if (auto *STy = dyn_cast<StructType>(Ty)) {
+      // Members are placed at the offsets given by the struct layout of the data layout.
+      const auto &SL = DL.getStructLayout(STy);
+      for (unsigned ElemIdx = 0; ElemIdx < STy->getNumElements(); ++ElemIdx) {
+        auto *ElemTy = STy->getElementType(ElemIdx);
+        unsigned ByteOffset = SL->getElementOffset(ElemIdx);
+        populateRecursively(ElemTy, DL, Result, AccumByteOffset + ByteOffset);
+      }
+    } else if (isa<VectorType>(Ty)) {
+      // We don't support nor expect non-fixed vector types
+      auto *VecTy = cast<FixedVectorType>(Ty);
+      // Vectors are always bit-packed without padding.
+      //
+      // We support all vectors of element types with a byte-aligned size.
+      // Element sizes do not have to be dword-aligned for this function to correctly
+      // compute an argument layout info. However non-dword aligned elements might be handled
+      // conservatively by the following analysis.
+      //
+      // We don't support vectors whose element types are not byte-aligned, as the code below uses byte-based offsets.
+      // Such vectors should not be passed in arguments. If we really need to support them in the future,
+      // one possibility would be populating explicitly invalidated argument layout infos.
+      Type *ElemTy = VecTy->getElementType();
+      unsigned NumElemBits = DL.getTypeSizeInBits(ElemTy);
+      assert(NumElemBits % 8 == 0);
+      unsigned NumElemBytes = NumElemBits / 8;
+      unsigned NumElems = VecTy->getNumElements();
+      // Elements are tightly packed, so element ElemIdx starts ElemIdx * NumElemBytes bytes into the vector.
+      for (unsigned ElemIdx = 0; ElemIdx < NumElems; ++ElemIdx)
+        appendScalarSlots(Result, AccumByteOffset + ElemIdx * NumElemBytes, NumElemBytes);
+    } else if (auto *ArrTy = dyn_cast<ArrayType>(Ty)) {
+      // Elements are placed at multiples of the alloc size of the element type.
+      Type *ElemTy = ArrTy->getElementType();
+      unsigned NumElems = ArrTy->getNumElements();
+      unsigned ElemStrideInBytes = DL.getTypeAllocSize(ElemTy).getFixedValue();
+      for (unsigned ElemIdx = 0; ElemIdx < NumElems; ++ElemIdx)
+        populateRecursively(ElemTy, DL, Result, AccumByteOffset + ElemIdx * ElemStrideInBytes);
+    } else {
+      assert(Ty->isSingleValueType());
+      // Pointers, integers, floats
+      unsigned NumBits = DL.getTypeSizeInBits(Ty);
+      assert(NumBits % 8 == 0);
+      unsigned NumBytes = NumBits / 8;
+      appendScalarSlots(Result, AccumByteOffset, NumBytes);
+    }
+
+    [[maybe_unused]] unsigned NumAddedArgSlots = Result.numArgumentSlots() - PrevNumArgSlots;
+    assert(NumAddedArgSlots == lgc::cps::getArgumentDwordCount(DL, Ty));
+  }
+};
+
+// Stores an outgoing jump, together with the first outgoing argument that should be considered.
+struct JumpInfo {
+  CallInst *Outgoing = nullptr;             // The call whose arguments are the outgoing arguments.
+  unsigned FirstRelevantOutgoingArgIdx = 0; // Arguments of Outgoing before this index are not considered.
+};
+
+struct AwaitInfo : public JumpInfo { // The inherited Outgoing is the call that passes the outgoing arguments.
+  // For awaits, we handle both lgc.cps.await and legacy awaits.
+  // lgc.cps uses a single await call, like:
+  //   %result = call @lgc.cps.await(i32 %target, i32 %levels, args...)
+  // legacy mode uses *two* calls, first invoking target, and then awaiting the result:
+  //   %handle = call ptr inttoptr (i32 %target to ptr)(args...)
+  //   %result = call @await(ptr %handle)
+  // For legacy awaits, this is the second call that obtains the result value.
+  // For lgc.cps.await, it is the unique await call.
+  CallInst *AwaitedResult = nullptr;
+};
+
+struct FunctionData {
+  lgc::rt::RayTracingShaderStage Stage = lgc::rt::RayTracingShaderStage::Count; // Set by collectFunctions.
+  bool IsDuringTraversal = false; // Set to true by collectFunctions for Intersection, AnyHit and Traversal.
+  SmallVector<JumpInfo> Jumps;    // Filled by collectJumpsAndAwaits.
+  SmallVector<AwaitInfo> Awaits;  // Filled by collectJumpsAndAwaits, for both lgc.cps and legacy awaits.
+};
+
+struct SpecializeDriverShadersPassImpl {
+public:
+  static constexpr unsigned ArgSlotSizeInBytes = 4;
+  static constexpr unsigned MaxNumAnalyzedArgSlots = 256;
+
+  Module &M;
+  const DataLayout &DL;
+  SpecializeDriverShadersOptions Opts;
+  ArgSlotsInfo &TraversalArgsInfo;
+  // If TraversalArgsInfo is trivial when starting the pass, meaning there was no metadata that
+  // we could deserialize from, conservatively do not optimize, because it could mean that
+  // the pipeline compiler is not merging and propagating cross-module state.
+  bool HadNonTrivialIncomingTraversalArgsInfo = true;
+  MapVector<Function *, FunctionData> ToProcess;
+  // We usually have only one, but supporting more is trivial and helps testing.
+  SmallVector<Function *> TraversalFunctions;
+  Type *I32 = nullptr;
+  // When considering incoming function args to be preserved/specialized, ignore this many arguments.
+  unsigned FirstRelevantIncomingArgIdx = -1;
+  unsigned FirstRelevantOutgoingJumpArgIdx = -1;
+  // Cache for per-type ArgumentLayoutInfos. unique_ptr for stable storage as DenseMap may invalidate iterators.
+  DenseMap<Type *, std::unique_ptr<ArgumentLayoutInfo>> ArgLayoutInfos;
+
+  SpecializeDriverShadersPassImpl(Module &M, ArgSlotsInfo &TraversalArgsInfo,
+                                  const SpecializeDriverShadersOptions &Opts)
+      : M{M}, DL{M.getDataLayout()}, Opts{Opts}, TraversalArgsInfo{TraversalArgsInfo},
+        HadNonTrivialIncomingTraversalArgsInfo{!TraversalArgsInfo.ArgSlots.empty()},
+        I32{Type::getInt32Ty(M.getContext())} {
+    const bool IsLgcCps = ContHelper::isLgcCpsModule(M);
+
+    // Number of leading incoming function args that are ignored:
+    //  * lgc.cps: cont state, return addr, shaderRecIdx
+    //  * otherwise: returnAddr
+    FirstRelevantIncomingArgIdx = IsLgcCps ? 3 : 1;
+
+    // Number of leading outgoing jump args that are ignored:
+    //  * lgc.cps: shaderAddr, levels, state, csp, returnAddr, shaderRecIdx
+    //  * otherwise: shaderAddr, levels, state, csp, returnAddr
+    FirstRelevantOutgoingJumpArgIdx = IsLgcCps ? 6 : 5;
+  }
+
+  // Collects and analyzes functions, then specializes Traversal functions; returns none() iff it specialized.
+  PreservedAnalyses run(ModuleAnalysisManager &AnalysisManager) {
+    collectFunctions();
+    collectJumpsAndAwaits();
+
+    if (!Opts.DisableAnalysis) {
+      for (auto &[F, Data] : ToProcess)
+        analyze(F, Data);
+    } else {
+      LLVM_DEBUG({ dbgs() << "[SDS] Analysis is disabled, skipping\n"; });
+    }
+
+    bool DoSpecialize = true;
+    if (TraversalFunctions.empty()) {
+      DoSpecialize = false;
+    } else if (!HadNonTrivialIncomingTraversalArgsInfo) {
+      DoSpecialize = false;
+      LLVM_DEBUG({ dbgs() << "[SDS] No incoming traversal args info, skipping specialization\n"; });
+    } else if (Opts.DisableSpecialization) {
+      DoSpecialize = false;
+      LLVM_DEBUG({ dbgs() << "[SDS] Specialization disabled, skipping specialization\n"; });
+    }
+    if (!DoSpecialize)
+      return PreservedAnalyses::all();
+    for (Function *TraversalFunc : TraversalFunctions)
+      specializeFunction(TraversalFunc, TraversalArgsInfo);
+    return PreservedAnalyses::none();
+  }
+
+  // Fills ToProcess with all defined functions of a relevant shader stage, and TraversalFunctions with Traversal.
+  void collectFunctions() {
+    using RtStage = lgc::rt::RayTracingShaderStage;
+    const auto RunsDuringTraversal = [](RtStage Stage) -> bool {
+      switch (Stage) {
+      case RtStage::ClosestHit:
+      case RtStage::Miss:
+      case RtStage::RayGeneration:
+        return false;
+      case RtStage::Intersection:
+      case RtStage::AnyHit:
+        // For Traversal, we also analyze jumps out of Traversal to CHS/Miss, which is not required and could
+        // restrict optimization opportunities unnecessarily. In practice, it shouldn't matter though.
+      case RtStage::Traversal:
+        return true;
+      case RtStage::Callable:
+      case RtStage::KernelEntry:
+      case RtStage::Count:
+        report_fatal_error("Unexpected shader stage " + Twine(static_cast<int>(Stage)));
+      }
+      report_fatal_error("Unknown shader stage " + Twine(static_cast<int>(Stage)));
+    };
+
+    for (Function &F : M) {
+      if (F.isDeclaration())
+        continue;
+      auto OptStage = lgc::rt::getLgcRtShaderStage(&F);
+      if (!OptStage)
+        continue;
+      // CallShader is not allowed in AHS/Intersection, so we can ignore callable shaders.
+      // Note that we don't have a way to differentiate TraceRay awaits from CallShader awaits
+      // in RayGen/CHS/Miss, and so pessimistically include CallShader awaits in the analysis.
+      if (*OptStage == RtStage::Callable || *OptStage == RtStage::KernelEntry)
+        continue;
+
+      FunctionData Data{};
+      Data.Stage = *OptStage;
+      Data.IsDuringTraversal = RunsDuringTraversal(*OptStage);
+
+      [[maybe_unused]] bool DidInsert = ToProcess.insert({&F, std::move(Data)}).second;
+      assert(DidInsert);
+      if (*OptStage == RtStage::Traversal)
+        TraversalFunctions.push_back(&F);
+    }
+  }
+
+  void collectJumpsAndAwaits() { // Fills FunctionData::Jumps and ::Awaits for all functions in ToProcess.
+    struct State {
+      SpecializeDriverShadersPassImpl &Self;
+    };
+
+    static const auto HandleJumpOrAwait = [](State &State, Instruction &Op) {
+      Function *F = Op.getFunction();
+      auto *CI = cast<CallInst>(&Op);
+      auto *It = State.Self.ToProcess.find(F);
+      if (It == State.Self.ToProcess.end())
+        return; // The containing function is not one we process.
+
+      FunctionData &Data = It->second;
+      if (isa<lgc::cps::JumpOp>(Op)) {
+        Data.Jumps.push_back({CI, State.Self.FirstRelevantOutgoingJumpArgIdx});
+      } else {
+        assert(isa<lgc::cps::AwaitOp>(Op));
+        // ignore: shaderAddr, levels, shaderRecIdx
+        Data.Awaits.push_back({{CI, 3}, CI}); // The same call passes the args and yields the awaited result.
+      }
+    };
+
+    static const auto Visitor =
+        llvm_dialects::VisitorBuilder<State>().addSet<lgc::cps::JumpOp, lgc::cps::AwaitOp>(HandleJumpOrAwait).build();
+
+    State S{*this};
+    Visitor.visit(S, M);
+
+    // Also collect legacy awaits.
+    // Because there can be multiple overloads, we need to collect all functions starting with "await".
+    for (auto &F : M.functions()) {
+      if (F.getName().starts_with("await")) {
+        forEachCall(F, [&](CallInst &AwaitResult) {
+          Function *ContainingFunc = AwaitResult.getFunction();
+          auto *It = ToProcess.find(ContainingFunc);
+          if (It == ToProcess.end())
+            return; // ignore this call
+
+          // Legacy awaits look like this:
+          //   %awaitHandle = call ptr inttoptr (i32 %target to ptr)(args...)
+          //   %awaitResult = call @await(ptr %awaitHandle)
+          assert(AwaitResult.arg_size() == 1);
+          auto *AwaitHandle = cast<CallInst>(AwaitResult.getArgOperand(0));
+          assert(AwaitHandle->getType()->isPointerTy());
+          FunctionData &Data = It->second;
+          // Legacy awaited calls have only normal args.
+          // The awaited function is indirectly called, and thus not an arg,
+          // and the optional wait mask is on metadata.
+          unsigned FirstRelevantArgIdx = 1; // ignore return address
+          Data.Awaits.push_back({{AwaitHandle, FirstRelevantArgIdx}, &AwaitResult});
+        });
+      }
+    }
+  }
+
+  // Returns the cached layout info for Ty, computing and caching it on first use.
+  const ArgumentLayoutInfo &getOrComputeArgumentLayoutInfo(Type *Ty) {
+    auto [It, Inserted] = ArgLayoutInfos.try_emplace(Ty);
+    if (Inserted)
+      It->second = std::make_unique<ArgumentLayoutInfo>(ArgumentLayoutInfo::get(Ty, DL));
+    return *It->second;
+  }
+
+  // If IsDuringTraversal is false, returns a trivial info, because there is nothing to preserve.
+  // Otherwise, collect incoming args, and a mapping of await results to incoming function args
+  // so the value origin tracker handles await results like incoming function args.
+  IncomingArgSlotValuesWithOffsets computeToBePreservedIncomingArgSlots(Function *F, const FunctionData &Data) {
+    IncomingArgSlotValuesWithOffsets Result{};
+    // analyze() dereferences AwaitOriginAssumptions unconditionally, so engage it on the trivial path as well.
+    Result.AwaitOriginAssumptions.emplace();
+    if (!Data.IsDuringTraversal)
+      return Result;
+    // Collect incoming args
+    for (unsigned ArgIdx = FirstRelevantIncomingArgIdx; ArgIdx < F->arg_size(); ++ArgIdx) {
+      Value *Arg = F->getArg(ArgIdx);
+      const ArgumentLayoutInfo &ArgLayoutInfo = getOrComputeArgumentLayoutInfo(Arg->getType());
+
+      for (unsigned CurArgSlot = 0; CurArgSlot < ArgLayoutInfo.numArgumentSlots(); ++CurArgSlot) {
+        auto LayoutSlotInfo = ArgLayoutInfo.SlotInfos[CurArgSlot];
+        ValueWithOffset CurArgSlotInfo{Arg, 0};
+        if (LayoutSlotInfo.CoversAlignedDword) {
+          CurArgSlotInfo.ByteOffset = LayoutSlotInfo.ByteOffset;
+        } else {
+          // We can't analyze this arg slot. Invalidate.
+          CurArgSlotInfo.Val = nullptr;
+        }
+        DETAIL_DEBUG({
+          dbgs() << "[SDS] Analyze global incoming arg slot " << Result.ArgSlots.size() << ": ";
+          if (CurArgSlotInfo.Val)
+            dbgs() << *CurArgSlotInfo.Val << ", offset " << CurArgSlotInfo.ByteOffset << "\n";
+          else
+            dbgs() << "<unknown>\n";
+        });
+        Result.ArgSlots.push_back({CurArgSlotInfo});
+      }
+    }
+
+    // Collect await results, decompose them into virtual incoming argument slots, and map
+    // these argument slots to the corresponding incoming function argument slots.
+    // Then, add assumptions for value origin tracking that assume await result argument
+    // slots to equal the mapped incoming argument slots.
+    // We could alternatively map them to the corresponding outgoing await args,
+    // but that doesn't make a difference as the outgoing await is separately analyzed,
+    // and non-preserved args are detected when doing that.
+    // Result.AwaitOriginAssumptions was engaged at the start of this function and is still empty here.
+    for (const auto &AwaitInfo : Data.Awaits) {
+      auto *AwaitResult = AwaitInfo.AwaitedResult;
+      // Await results are expected to be a struct type that wraps the actual args
+      // We treat the struct members like incoming function arguments,
+      // because await lowering will turn the part after the await into a function that takes exactly
+      // the struct members as arguments.
+      // For each element of the struct, compute its argument layout, which gives a partial covering of the
+      // in-memory-layout of the type with dwords as used in the argument layout.
+      // Then, construct an OriginAssumption that maps those slices of the await result that
+      // have a corresponding arg slot to the value and offset of that incoming arg slot,
+      // and map slices covered by padding to themselves.
+      // If there are argument slots that do not correspond to full aligned dword in the containing type,
+      // conservatively ignore these arg slots, and do not add assumptions.
+      auto *STy = cast<StructType>(AwaitResult->getType());
+      assert(!STy->isPacked() && "packed await result structs not supported");
+      const auto *SL = DL.getStructLayout(STy);
+      ValueTracking::ValueInfo &OriginAssumption = (*Result.AwaitOriginAssumptions)[AwaitResult];
+
+      unsigned AccumArgSlot = 0;
+      bool Stop = false;
+      for (unsigned ElemIdx = 0; ElemIdx < STy->getNumElements() && !Stop; ++ElemIdx) {
+        auto *ElemTy = STy->getElementType(ElemIdx);
+        unsigned ElementByteOffset = SL->getElementOffset(ElemIdx);
+        const ArgumentLayoutInfo &ArgLayoutInfo = getOrComputeArgumentLayoutInfo(ElemTy);
+        unsigned NumArgSlots = ArgLayoutInfo.numArgumentSlots();
+        if (ElementByteOffset % 4 != 0) {
+          // Don't add assumptions for this element, but still account for the arg slots it occupies.
+          AccumArgSlot += NumArgSlots;
+          continue;
+        }
+        for (unsigned LocalArgSlotIdx = 0; LocalArgSlotIdx < NumArgSlots; ++LocalArgSlotIdx) {
+          unsigned GlobalArgSlotIdx = AccumArgSlot + LocalArgSlotIdx;
+          if (GlobalArgSlotIdx >= Result.ArgSlots.size()) {
+            // We ran out of incoming arguments to map to, stop.
+            // Assumptions on prefixes of values are supported.
+            Stop = true;
+            break;
+          }
+          // There is a corresponding incoming argument
+          // Before we add this slice, mapping to the incoming arg slot,
+          // ensure we are at the correct slice, and add dummy padding slices if necessary
+          auto LayoutSlotInfo = ArgLayoutInfo.SlotInfos[LocalArgSlotIdx];
+          if (!LayoutSlotInfo.CoversAlignedDword) {
+            // Can't analyze this arg slot, don't add an assumption
+            continue;
+          }
+          unsigned LocalByteOffset = LayoutSlotInfo.ByteOffset;
+          while (OriginAssumption.Slices.size() * 4 < ElementByteOffset + LocalByteOffset) {
+            ValueTracking::SliceInfo TrivialAssumption{ValueTracking::SliceStatus::Dynamic};
+            TrivialAssumption.DynamicValue = AwaitResult;
+            TrivialAssumption.DynamicValueByteOffset = OriginAssumption.Slices.size() * 4;
+            OriginAssumption.Slices.push_back(TrivialAssumption);
+          }
+          assert(OriginAssumption.Slices.size() * 4 == ElementByteOffset + LocalByteOffset);
+          const ValueWithOffset &InputArgSlotInfo = Result.ArgSlots[GlobalArgSlotIdx];
+          if (InputArgSlotInfo.Val == nullptr) {
+            // Overlapping scalars, can't analyze arg slot and can't add assumption
+            continue;
+          }
+          ValueTracking::SliceInfo ArgSlotAssumption{ValueTracking::SliceStatus::Dynamic};
+          ArgSlotAssumption.DynamicValue = InputArgSlotInfo.Val;
+          ArgSlotAssumption.DynamicValueByteOffset = InputArgSlotInfo.ByteOffset;
+          DETAIL_DEBUG({
+            dbgs() << "[SDS] Mapping arg slot " << GlobalArgSlotIdx << " of await result ";
+            AwaitResult->printAsOperand(dbgs());
+            dbgs() << " (element idx " << ElemIdx << ", element type " << *ElemTy << ", local byte offset "
+                   << LocalByteOffset << ") to input arg " << *InputArgSlotInfo.Val << ", offset "
+                   << InputArgSlotInfo.ByteOffset << "\n";
+          });
+          OriginAssumption.Slices.push_back(ArgSlotAssumption);
+        }
+        AccumArgSlot += NumArgSlots;
+      }
+    }
+
+    return Result;
+  }
+
+  // Given an outgoing arg slot and the value passed to it, determine the status of that arg slot (e.g. whether it
+  // preserves an incoming one, passes a constant, an undef/poison, or an unknown dynamic value).
+  // The arg slot is identified by GlobalArgSlotIndex.
+  // For instance, the third arg slot in call(i32, i64 %foo) has the global arg slot index 2,
+  // value %foo and local arg slot index 1, because it is the second dword of %foo.
+  ArgSlotInfo computeOutgoingArgSlotInfo(const IncomingArgSlotValuesWithOffsets &ToBePreservedIncomingArgsInfos,
+                                         Value *Arg, const ArgumentLayoutSlotInfo &LayoutSlotInfo,
+                                         unsigned GlobalArgSlotIndex, ValueOriginTracker &VOT) {
+    if (!LayoutSlotInfo.CoversAlignedDword) {
+      DETAIL_DEBUG({ dbgs() << "[SDS] Can't analyze arg slot, doesn't cover aligned dword\n"; });
+      return ArgSlotInfo{ArgSlotStatus::Dynamic};
+    }
+    unsigned LocalByteOffset = LayoutSlotInfo.ByteOffset;
+    assert(LocalByteOffset % 4 == 0);
+    const ValueTracking::ValueInfo &ArgVI = VOT.getValueInfo(Arg);
+    DETAIL_DEBUG({
+      dbgs() << "[SDS] byte offset " << LocalByteOffset << " of " << *Arg << ", global slot " << GlobalArgSlotIndex
+             << "\n";
+    });
+    unsigned SliceIdx = LocalByteOffset / 4;
+
+    if (SliceIdx >= ArgVI.Slices.size()) {
+      // No value origin info for this arg slot, give up
+      DETAIL_DEBUG(dbgs() << "[SDS] no slice info\n";);
+      return ArgSlotInfo{ArgSlotStatus::Dynamic};
+    }
+
+    // we have a slice info for the current outgoing argument slot
+    const ValueTracking::SliceInfo &ArgSI = ArgVI.Slices[SliceIdx];
+    if (ArgSI.Status.contains(ValueTracking::SliceStatus::Dynamic)) {
+      if (GlobalArgSlotIndex >= ToBePreservedIncomingArgsInfos.ArgSlots.size()) {
+        // There is no corresponding incoming argument on the same slot, so we already know
+        // this can't be a preserved value. Give up on this argument slot.
+        DETAIL_DEBUG({ dbgs() << "[SDS] no incoming arg slot. SI: " << ArgSI << "\n"; });
+        return ArgSlotInfo{ArgSlotStatus::Dynamic};
+      }
+      // In case the outgoing value is obtained from a phi node that forwards either an incoming
+      // argument or an await result, the value origin assumptions that map await results
+      // to input arguments allow the value origin tracker to point to input args in these cases,
+      // also with nested phis. Thus, we don't have to deal with phi nodes here,
+      // and can directly compare against the incoming arg.
+      if (ToBePreservedIncomingArgsInfos.ArgSlots[GlobalArgSlotIndex] !=
+          ValueWithOffset{ArgSI.DynamicValue, ArgSI.DynamicValueByteOffset}) {
+        DETAIL_DEBUG({
+          const auto &TBP = ToBePreservedIncomingArgsInfos.ArgSlots[GlobalArgSlotIndex];
+          dbgs() << "[SDS] no match. ArgSI: " << ArgSI << ", to be preserved: ";
+          if (TBP.Val) // Val is null for incoming arg slots that could not be analyzed
+            dbgs() << *TBP.Val << ", offset " << TBP.ByteOffset;
+          dbgs() << (TBP.Val ? "\n" : "<unknown>\n");
+        });
+        return ArgSlotInfo{ArgSlotStatus::Dynamic};
+      }
+      // All paths that use a dynamic value for this outgoing arg slot preserve the incoming arg slot,
+      // so we can ignore this. Check other status first, and assign Preserve status if there are no others.
+    }
+
+    if (ArgSI.Status.contains(ValueTracking::SliceStatus::Constant)) {
+      // Do this even if the value might be undef, as it is feasible to combine undef and constant into constant.
+      // If we want to conservatively treat undef/poison as zero in the future, we'd need to change this.
+      DETAIL_DEBUG({ dbgs() << "[SDS] Constant: " << ArgSI.ConstantValue << "\n"; });
+      return ArgSlotInfo{ArgSlotStatus::Constant, ArgSI.ConstantValue};
+    }
+
+    if (ArgSI.Status.contains(ValueTracking::SliceStatus::UndefOrPoison)) {
+      DETAIL_DEBUG({ dbgs() << "[SDS] UndefOrPoison:\n"; });
+      return ArgSlotInfo{ArgSlotStatus::UndefOrPoison};
+    }
+
+    assert(ArgSI.Status == ValueTracking::SliceStatus::Dynamic);
+    DETAIL_DEBUG({ dbgs() << "[SDS] Preserve:\n"; });
+    return ArgSlotInfo{ArgSlotStatus::Preserve};
+  }
+
+#ifndef NDEBUG
+  // Sort JumpInfos by instruction order in the containing function.
+  // This ensures processing order (and thereby debug output order) matches input IR order for lit tests.
+  void sortByInstructionOrder(SmallVectorImpl<JumpInfo> &JumpInfos) const {
+    if (JumpInfos.empty())
+      return;
+    Function *F = JumpInfos[0].Outgoing->getFunction();
+
+    // Maps instructions to entry indices in JumpInfos
+    SmallDenseMap<const Instruction *, unsigned> JumpToIndex;
+    for (const auto &[Index, JumpInfo] : enumerate(JumpInfos)) {
+      assert(JumpInfo.Outgoing->getFunction() == F);
+      [[maybe_unused]] auto Inserted = JumpToIndex.insert({JumpInfo.Outgoing, Index}).second;
+      assert(Inserted);
+    }
+
+    SmallVector<JumpInfo> Result;
+    Result.reserve(JumpInfos.size());
+    for (const auto &BB : *F) {
+      for (const auto &Inst : BB) {
+        auto It = JumpToIndex.find(&Inst);
+        if (It != JumpToIndex.end()) {
+          Result.push_back(JumpInfos[It->second]);
+          JumpToIndex.erase(It);
+        }
+      }
+    }
+    assert(Result.size() == JumpInfos.size());
+
+    JumpInfos = std::move(Result);
+  }
+#endif
+
+  // Collect and return the set of outgoing jumps/awaits that may be during Traversal.
+  SmallVector<JumpInfo> getRelevantOutgoingJumpsAndAwaits(const FunctionData &Data) const {
+    SmallVector<JumpInfo> JumpsAndAwaits;
+    JumpsAndAwaits.reserve(Data.Jumps.size() + Data.Awaits.size());
+    for (const auto &AwaitInfo : Data.Awaits)
+      JumpsAndAwaits.push_back(AwaitInfo);
+
+    // Ignore jumps in shaders outside of Traversal:
+    // These are shader returns, and thus are neither during Traversal, nor entering Traversal.
+    if (Data.IsDuringTraversal)
+      JumpsAndAwaits.append(Data.Jumps);
+
+#ifndef NDEBUG
+    if (M.getNamedMetadata("lgc.rt.specialize.driver.shaders.process.in.instruction.order"))
+      sortByInstructionOrder(JumpsAndAwaits);
+#endif
+
+    return JumpsAndAwaits;
+  }
+
+  // This is a performance optimization.
+  // We know that we are going to query the ValueOriginTracker about all arguments passed to all of these
+  // jumps and awaits. The value origin analysis is more efficient when done in bulk, so do that here.
+  // The later queries will then return cached results.
+  void runValueTrackingAnalysisOnAllOutgoingArgs(ValueOriginTracker &VOT, ArrayRef<JumpInfo> JumpsAndAwaits) {
+    SmallVector<Value *> OutgoingArgs;
+    for (const auto &JumpOrAwait : JumpsAndAwaits) {
+      for (unsigned OutgoingArgIdx = JumpOrAwait.FirstRelevantOutgoingArgIdx;
+           OutgoingArgIdx < JumpOrAwait.Outgoing->arg_size(); ++OutgoingArgIdx) {
+        Value *OutgoingArg = JumpOrAwait.Outgoing->getArgOperand(OutgoingArgIdx);
+        // This might add duplicates, but that's fine.
+        OutgoingArgs.push_back(OutgoingArg);
+      }
+    }
+    VOT.analyzeValues(OutgoingArgs);
+  }
+
+  void analyze(Function *F, FunctionData &Data) {
+    // We analyze both jumps and awaits.
+    // We treat all awaits as potentially starting or continuing Traversal.
+    // This is accurate for TraceRay and ReportHit, and pessimistic for CallShader.
+    //
+    // At this stage, before coro passes, jumps come from two sources:
+    //   * app shader returns
+    //   * Traversal enqueues
+    //
+    // In both cases, we determine based on the shader type whether jumps may be in Traversal state.
+    // For in-Traversal shaders, we analyze all jumps and awaits, and preserving arguments is allowed.
+    // Otherwise (CHS/Miss/RGS), we ignore outgoing jumps, as they come from app shader returns outside
+    // of Traversal, and do not allow preserving arguments in awaits, because the incoming arguments of these
+    // shaders are set up outside of the Traversal state.
+
+    // Collect information about incoming arguments and results returned by awaits.
+    // These are used to determine potential preserved arguments.
+    auto ToBePreservedInputArgsInfo = computeToBePreservedIncomingArgSlots(F, Data);
+
+    // Filter relevant jumps and awaits. Ignore those known to happen outside of Traversal.
+    auto JumpsAndAwaits = getRelevantOutgoingJumpsAndAwaits(Data);
+
+    // Initialize a new value origin tracker for the current function.
+    // Move AwaitOriginAssumptions into the VOT to prevent a copy, and reset the optional
+    // to prevent unintended accesses.
+    CompilerUtils::ValueOriginTracker::Options Opts{};
+    Opts.BytesPerSlice = ArgSlotSizeInBytes;
+    Opts.MaxBytesPerValue = MaxNumAnalyzedArgSlots * ArgSlotSizeInBytes;
+    // Handle freeze poison conservatively. Optimizing based on it requires to replace affected freeze poison
+    // by something else (e.g. zeroinitializer), which means we'd need to change app shaders and not just
+    // Traversal. As of now, in tests it didn't make a difference.
+    Opts.FreezeMode = CompilerUtils::ValueOriginTracker::Options::FreezeHandlingMode::Dynamic;
+    CompilerUtils::ValueOriginTracker VOT{DL, Opts, std::move(*ToBePreservedInputArgsInfo.AwaitOriginAssumptions)};
+    ToBePreservedInputArgsInfo.AwaitOriginAssumptions.reset();
+
+    // Do a bulk value origin analysis on all relevant outgoing args. This is more efficient than individual
+    // queries.
+    runValueTrackingAnalysisOnAllOutgoingArgs(VOT, JumpsAndAwaits);
+
+    LLVM_DEBUG(dbgs() << "[SDS] Analyzing function " << F->getName() << " (shader stage " << Data.Stage << ")\n";);
+
+    // The summary of preserved/constant outgoing argument infos for this function
+    ArgSlotsInfo FuncArgsInfo;
+    for (auto [JumpOrAwait, FirstRelevantArgIdx] : JumpsAndAwaits) {
+      // The different jump or continue intrinsics have a different amount of "system" arguments that are not
+      // actually passed as argument to the jumped-to function, e.g. the function itself, or possibly a wait mask.
+      // These system arguments come before the actual arguments, and need to be ignored for the argument
+      // analysis.
+
+      ArgSlotsInfo CurOutgoingArgsInfo{};
+      unsigned AccumulatedArgSlotIndex = 0;
+
+      for (unsigned ArgIdx = FirstRelevantArgIdx; ArgIdx < JumpOrAwait->arg_size(); ++ArgIdx) {
+        Value *Arg = JumpOrAwait->getArgOperand(ArgIdx);
+        Type *ArgTy = Arg->getType();
+        const ArgumentLayoutInfo &ArgLayoutInfo = getOrComputeArgumentLayoutInfo(ArgTy);
+        unsigned NumArgSlots = ArgLayoutInfo.numArgumentSlots();
+
+        // LocalArgSlot indexes into arg slots used by the current argument
+        for (unsigned LocalArgSlotIndex = 0; LocalArgSlotIndex < NumArgSlots; ++LocalArgSlotIndex) {
+          // GlobalArgSlot indexes into all arg slots
+          unsigned GlobalArgSlotIndex = AccumulatedArgSlotIndex + LocalArgSlotIndex;
+          const auto &LayoutSlotInfo = ArgLayoutInfo.SlotInfos[LocalArgSlotIndex];
+          CurOutgoingArgsInfo.ArgSlots.push_back(
+              computeOutgoingArgSlotInfo(ToBePreservedInputArgsInfo, Arg, LayoutSlotInfo, GlobalArgSlotIndex, VOT));
+        }
+        AccumulatedArgSlotIndex += NumArgSlots;
+      }
+      LLVM_DEBUG({
+        dbgs() << "[SDS] Analyzed outgoing call " << *JumpOrAwait << "\n";
+        CurOutgoingArgsInfo.printTable(dbgs(), "[SDS] ");
+      });
+      FuncArgsInfo = ArgSlotsInfo::combine(FuncArgsInfo, CurOutgoingArgsInfo);
+    }
+
+    LLVM_DEBUG({
+      dbgs() << "[SDS] Finished analysis of function " << F->getName() << "\n";
+      FuncArgsInfo.printTable(dbgs(), "[SDS] ");
+    });
+    TraversalArgsInfo = ArgSlotsInfo::combine(TraversalArgsInfo, FuncArgsInfo);
+  }
+
+  // Result of specializing one argument: the replacement value, plus the attempted and actually replaced dwords.
+  struct SpecializeArgResult {
+    Value *Replacement;
+    unsigned NumToBeReplacedDwords;
+    unsigned NumReplacedDwords;
+  };
+
+  using ValueSpecializer = CompilerUtils::ValueSpecializer;
+  // Specializes Arg via VS. GlobalArgSlotBegin is the index of the first argument slot occupied by this argument.
+  SpecializeArgResult specializeArgument(const ArgSlotsInfo &SpecializationInfo, ValueSpecializer &VS, Argument *Arg,
+                                         const ArgumentLayoutInfo &ArgumentLayoutInfo, unsigned GlobalArgSlotBegin) {
+    unsigned NumArgSlots = ArgumentLayoutInfo.numArgumentSlots();
+    // Set up data for ValueSpecializer. This requires converting the specialization info from per-arg-slot to
+    // per-dword.
+    SmallVector<ValueSpecializer::DwordSpecializationInfo> SpecializationInfos;
+    unsigned NumBytes = DL.getTypeStoreSize(Arg->getType());
+    unsigned NumDwords = divideCeil(NumBytes, 4);
+    SpecializationInfos.reserve(NumDwords);
+    unsigned NumToBeReplacedDwords = 0;
+
+    for (unsigned LocalArgSlotIdx = 0; LocalArgSlotIdx < NumArgSlots; ++LocalArgSlotIdx) {
+      unsigned GlobalArgSlotIdx = GlobalArgSlotBegin + LocalArgSlotIdx;
+      if (GlobalArgSlotIdx >= SpecializationInfo.ArgSlots.size()) {
+        // No info about this incoming arg slot or further ones, fill up with dynamic fallback ones at the end.
+        break;
+      }
+      const auto &SlotInfo = SpecializationInfo.ArgSlots[GlobalArgSlotIdx];
+      if (SlotInfo.Status == ArgSlotStatus::Dynamic) {
+        // Can't specialize dynamic arg slot
+        continue;
+      }
+      LLVM_DEBUG(
+          { dbgs() << "[SDS] Trying to specialize arg slot " << GlobalArgSlotIdx << " for " << SlotInfo << "\n"; });
+
+      const auto &LayoutSlotInfo = ArgumentLayoutInfo.SlotInfos[LocalArgSlotIdx];
+      if (!LayoutSlotInfo.CoversAlignedDword) {
+        LLVM_DEBUG(
+            { dbgs() << "[SDS] Can't analyze arg slot " << GlobalArgSlotIdx << ", doesn't cover aligned dword\n"; });
+        continue;
+      }
+
+      unsigned LocalByteOffset = LayoutSlotInfo.ByteOffset;
+      assert(LocalByteOffset % 4 == 0);
+      unsigned LocalDwordOffset = LocalByteOffset / 4;
+
+      while (SpecializationInfos.size() < LocalDwordOffset)
+        SpecializationInfos.push_back({ValueSpecializer::SpecializationKind::None});
+      assert(SpecializationInfos.size() == LocalDwordOffset);
+
+      ValueSpecializer::DwordSpecializationInfo DwordInfo{};
+      if (SlotInfo.Status == ArgSlotStatus::Constant) {
+        DwordInfo.Kind = ValueSpecializer::SpecializationKind::Constant;
+        DwordInfo.ConstantValue = SlotInfo.ConstantValue;
+      } else {
+        assert(SlotInfo.Status == ArgSlotStatus::UndefOrPoison || SlotInfo.Status == ArgSlotStatus::Preserve);
+        // If an argument slot is preserved by all shaders, and isn't constant or dynamic,
+        // then it is never initialized, and can be assumed to be poison.
+        // Use frozen poison to prevent propagation of poison into the containing value.
+        DwordInfo.Kind = ValueSpecializer::SpecializationKind::FrozenPoison;
+      }
+      SpecializationInfos.push_back(DwordInfo);
+      ++NumToBeReplacedDwords;
+    }
+
+    while (SpecializationInfos.size() < NumDwords)
+      SpecializationInfos.push_back({ValueSpecializer::SpecializationKind::None});
+
+    if (NumToBeReplacedDwords == 0) {
+      // Nothing to be done
+      return {};
+    }
+
+    // Preserve the builder insertion point, so argument specialization code is in argument order.
+    // This improves test readability.
+    auto [Replacement, NumReplacedDwords] =
+        VS.replaceDwords(Arg, SpecializationInfos, /* replace uses */ true, /* preserve insert point */ true);
+    return {Replacement, NumToBeReplacedDwords, NumReplacedDwords};
+  }
+
+  // Specializes the relevant incoming arguments of Func according to SpecializationInfo, in argument order.
+  void specializeFunction(Function *Func, const ArgSlotsInfo &SpecializationInfo) {
+    LLVM_DEBUG({
+      dbgs() << "[SDS] Specializing function, final args info:\n";
+      SpecializationInfo.printTable(dbgs(), "[SDS] ");
+    });
+    unsigned TotalNumToBeReplacedDwords = 0;
+    unsigned TotalNumReplacedDwords = 0;
+    unsigned AccumArgSlotIdx = 0;
+    ValueSpecializer VS{*Func->getParent()};
+    for (unsigned ArgIdx = FirstRelevantIncomingArgIdx; ArgIdx < Func->arg_size(); ++ArgIdx) {
+      Argument *Arg = Func->getArg(ArgIdx);
+      const auto &ArgumentLayoutInfo = getOrComputeArgumentLayoutInfo(Arg->getType());
+      auto Result = specializeArgument(SpecializationInfo, VS, Arg, ArgumentLayoutInfo, AccumArgSlotIdx);
+      TotalNumToBeReplacedDwords += Result.NumToBeReplacedDwords;
+      TotalNumReplacedDwords += Result.NumReplacedDwords;
+      AccumArgSlotIdx += ArgumentLayoutInfo.numArgumentSlots();
+      if (AccumArgSlotIdx >= SpecializationInfo.ArgSlots.size()) // no info on any further arg slot
+        break;
+    }
+    LLVM_DEBUG({
+      dbgs() << "[SDS] Replaced " << TotalNumReplacedDwords << " dwords in total, tried " << TotalNumToBeReplacedDwords
+             << " dwords.\n";
+    });
+  }
+};
+
+} // anonymous namespace
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// SpecializeDriverShadersState::Impl
+
+// Pimpl implementation class for SpecializeDriverShadersState.
+struct SpecializeDriverShadersState::Impl {
+  using Self = SpecializeDriverShadersState::Impl;
+
+  ArgSlotsInfo TraversalArgsInfo;
+
+  static llvm::Expected<Self> decodeMsgpack(llvm::msgpack::DocNode &Node) {
+    auto &MapNode = Node.getMap();
+
+    uint64_t Version = 0;
+    auto VersionNode = MapNode[MsgPackFormat::Version];
+    if (!VersionNode.isEmpty())
+      Version = VersionNode.getUInt();
+    if (Version != MsgPackFormat::MajorVersion)
+      return make_error<StringError>("bad/missing specialize-driver-shaders version", inconvertibleErrorCode());
+
+    Self Result{};
+
+    auto &TraversalNode = MapNode[MsgPackFormat::TraversalArgsInfo];
+    auto TraversalArgsInfoOrErr = ArgSlotsInfo::decodeMsgpack(TraversalNode);
+    if (auto Err = TraversalArgsInfoOrErr.takeError())
+      return Err;
+
+    Result.TraversalArgsInfo = *TraversalArgsInfoOrErr;
+    return Result;
+  }
+
+  void encodeMsgpack(llvm::msgpack::DocNode &Node) const {
+    auto &MapNode = Node.getMap(true);
+    MapNode[MsgPackFormat::Version] = MsgPackFormat::MajorVersion;
+    auto &TraversalNode = MapNode[MsgPackFormat::TraversalArgsInfo];
+    TraversalArgsInfo.encodeMsgpack(TraversalNode);
+  }
+
+  static llvm::Expected<Self> fromModuleMetadata(const llvm::Module &M) {
+    auto *MD = M.getNamedMetadata(MetadataFormat::State);
+    if (!MD) {
+      // If there is no metadata, start with a trivial state.
+      return Self{};
+    }
+    unsigned NumOperands = MD->getNumOperands();
+    if (NumOperands != 1)
+      return make_error<StringError>("unexpected number of nodes", inconvertibleErrorCode());
+
+    Self Result{};
+    auto AIOrErr = ArgSlotsInfo::fromMetadata(MD->getOperand(0));
+    if (auto Err = AIOrErr.takeError())
+      return Err;
+    Result.TraversalArgsInfo = *AIOrErr;
+    LLVM_DEBUG(Result.TraversalArgsInfo.printTable(dbgs(), "[SDS] Deserialized state from MD: "););
+
+    return Result;
+  }
+
+  void exportModuleMetadata(llvm::Module &M) const {
+    auto *MD = M.getOrInsertNamedMetadata(MetadataFormat::State);
+    MD->clearOperands();
+    MD->addOperand(TraversalArgsInfo.exportAsMetadata(M.getContext()));
+    LLVM_DEBUG(TraversalArgsInfo.printTable(dbgs(), "[SDS] Serialized state to MD: "););
+  }
+
+  void merge(const Self &Other) {
+    TraversalArgsInfo = ArgSlotsInfo::combine(TraversalArgsInfo, Other.TraversalArgsInfo);
+  }
+
+  bool operator==(const Impl &Other) const { return TraversalArgsInfo == Other.TraversalArgsInfo; }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// SpecializeDriverShadersOptions
+llvm::Expected<SpecializeDriverShadersOptions>
+SpecializeDriverShadersOptions::fromModuleMetadata(const llvm::Module &M) {
+  auto *MD = M.getNamedMetadata(MetadataFormat::Options);
+  if (!MD) {
+    // If there is no metadata, start with trivial options.
+    return SpecializeDriverShadersOptions{};
+  }
+
+  if (MD->getNumOperands() != 1)
+    return make_error<StringError>("unexpected number of nodes", inconvertibleErrorCode());
+
+  // The single node is the tuple written by exportModuleMetadata(); ensure both operands exist before reading.
+  auto *OptsNode = MD->getOperand(0);
+  if (OptsNode->getNumOperands() < 2)
+    return make_error<StringError>("unexpected number of options", inconvertibleErrorCode());
+  auto OptDisableSpecialization = MDHelper::extractZExtI32Constant(OptsNode->getOperand(0));
+  auto OptDisableAnalysis = MDHelper::extractZExtI32Constant(OptsNode->getOperand(1));
+
+  if (!OptDisableSpecialization.has_value() || !OptDisableAnalysis.has_value())
+    return make_error<StringError>("failed to import numeric options", inconvertibleErrorCode());
+
+  if (*OptDisableSpecialization >= 2u || *OptDisableAnalysis >= 2u)
+    return make_error<StringError>("invalid numerical boolean values", inconvertibleErrorCode());
+
+  SpecializeDriverShadersOptions Result{};
+  Result.DisableAnalysis = (*OptDisableAnalysis != 0);
+  Result.DisableSpecialization = (*OptDisableSpecialization != 0);
+  return Result;
+}
+
+void SpecializeDriverShadersOptions::exportModuleMetadata(llvm::Module &M) const {
+  auto *MD = M.getOrInsertNamedMetadata(MetadataFormat::Options);
+  MD->clearOperands();
+  MD->addOperand(MDTuple::get(M.getContext(), {MDHelper::getI32MDConstant(M.getContext(), DisableSpecialization),
+                                               MDHelper::getI32MDConstant(M.getContext(), DisableAnalysis)}));
+  // In debug builds, after serializing, check that deserializing yields the expected result
+  assert(cantFail(fromModuleMetadata(M)) == *this);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// SpecializeDriverShadersState
+SpecializeDriverShadersState::SpecializeDriverShadersState() : Pimpl{std::make_unique<Impl>()} {
+}
+
+SpecializeDriverShadersState::SpecializeDriverShadersState(const SpecializeDriverShadersState &Other)
+    : SpecializeDriverShadersState() {
+  if (Other.Pimpl)
+    *Pimpl = *Other.Pimpl;
+}
+
+SpecializeDriverShadersState::SpecializeDriverShadersState(SpecializeDriverShadersState &&Other)
+    : Pimpl(std::move(Other.Pimpl)) {
+}
+
+SpecializeDriverShadersState::SpecializeDriverShadersState(std::unique_ptr<Impl> Pimpl) : Pimpl(std::move(Pimpl)) {
+}
+
+SpecializeDriverShadersState::~SpecializeDriverShadersState() noexcept = default;
+
+SpecializeDriverShadersState &SpecializeDriverShadersState::operator=(const SpecializeDriverShadersState &Other) {
+  if (!Other.Pimpl) {
+    Pimpl.reset();
+  } else {
+    if (Pimpl)
+      *Pimpl = *Other.Pimpl;
+    else
+      Pimpl = std::make_unique<Impl>(*Other.Pimpl);
+  }
+  return *this;
+}
+
+SpecializeDriverShadersState &SpecializeDriverShadersState::operator=(SpecializeDriverShadersState &&Other) {
+  Pimpl = std::move(Other.Pimpl);
+  return *this;
+}
+
+llvm::Expected<SpecializeDriverShadersState> SpecializeDriverShadersState::decodeMsgpack(llvm::msgpack::DocNode &Node) {
+  auto Result = Impl::decodeMsgpack(Node);
+  if (auto Err = Result.takeError())
+    return Err;
+  return Self{std::make_unique<Impl>(std::move(*Result))}; // Result is a dying local, so move instead of copying
+}
+
+void SpecializeDriverShadersState::encodeMsgpack(llvm::msgpack::DocNode &Node) const {
+  assert(Pimpl && "Using invalid moved-from object");
+  Pimpl->encodeMsgpack(Node);
+  // In debug builds, after serializing, check that deserializing yields the expected result
+  assert(cantFail(Impl::decodeMsgpack(Node)) == *Pimpl);
+}
+
+llvm::Expected<SpecializeDriverShadersState> SpecializeDriverShadersState::fromModuleMetadata(const llvm::Module &M) {
+  auto Result = Impl::fromModuleMetadata(M);
+  if (auto Err = Result.takeError())
+    return Err;
+  return Self{std::make_unique<Impl>(std::move(*Result))}; // Result is a dying local, so move instead of copying
+}
+
+void SpecializeDriverShadersState::exportModuleMetadata(llvm::Module &M) const {
+  assert(Pimpl && "Using invalid moved-from object");
+  Pimpl->exportModuleMetadata(M);
+  // In debug builds, after serializing, check that deserializing yields the expected result
+  assert(cantFail(Impl::fromModuleMetadata(M)) == *Pimpl);
+}
+
+void SpecializeDriverShadersState::merge(SpecializeDriverShadersState const &Other) {
+  assert(Pimpl && Other.Pimpl && "Using invalid moved-from object");
+  Pimpl->merge(*Other.Pimpl);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// SpecializeDriverShadersPass
+llvm::PreservedAnalyses SpecializeDriverShadersPass::run(llvm::Module &Module,
+                                                         llvm::ModuleAnalysisManager &AnalysisManager) {
+  LLVM_DEBUG(dbgs() << "Run the specialize-driver-shaders pass\n");
+  AnalysisManager.getResult<DialectContextAnalysis>(Module);
+
+  auto StateOrErr = SpecializeDriverShadersState::Impl::fromModuleMetadata(Module);
+  if (!StateOrErr)
+    report_fatal_error(StateOrErr.takeError());
+  SpecializeDriverShadersState::Impl State = std::move(*StateOrErr);
+
+  auto OptsOrErr = SpecializeDriverShadersOptions::fromModuleMetadata(Module);
+  if (!OptsOrErr)
+    report_fatal_error(OptsOrErr.takeError());
+  SpecializeDriverShadersOptions Opts = *OptsOrErr;
+
+  auto Result = SpecializeDriverShadersPassImpl{Module, State.TraversalArgsInfo, Opts}.run(AnalysisManager);
+
+  State.exportModuleMetadata(Module);
+  return Result;
+}
diff --git a/llvmraytracing/test/dx/cleanup-continuations-malloc.ll b/llvmraytracing/test/dx/cleanup-continuations-malloc.ll
index 9530ebd768..2528908010 100644
--- a/llvmraytracing/test/dx/cleanup-continuations-malloc.ll
+++ b/llvmraytracing/test/dx/cleanup-continuations-malloc.ll
@@ -1,12 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck %s
+; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck %s
 
 target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
 
-%continuation.token = type { }
-
-declare void @await.void(%continuation.token*)
-declare %continuation.token* @async_fun()
+declare void @lgc.cps.await__void(...)
+declare ptr @async_fun(i64, i32)
 declare void @lgc.cps.jump(...)
 declare void @lgc.cps.complete()
 
@@ -19,12 +17,14 @@ define <4 x i32> @simple_await(i64 %dummyRet, <4 x i32> %arg) !continuation.regi
 ; CHECK-NEXT:    store <4 x i32> [[ARG]], ptr addrspace(32) [[ARG_SPILL_ADDR]], align 4
 ; CHECK-NEXT:    [[DUMMYRET_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1
 ; CHECK-NEXT:    store i64 [[DUMMYRET]], ptr addrspace(32) [[DUMMYRET_SPILL_ADDR]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun to i64), i32 -1, {} poison, i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; CHECK-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await.resume.0)
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[CALLEE]], i32 -1, {} poison, i32 poison, i64 [[TMP1]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
 ; CHECK-NEXT:    unreachable
 ;
-  %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1
-  call void @await.void(%continuation.token* %tok)
+  %callee = ptrtoint ptr @async_fun to i64
+  call void (...) @lgc.cps.await__void(i64 %callee, i32 3), !continuation.registercount !1, !continuation.returnedRegistercount !1
   call void (...) @lgc.cps.jump(i64 %dummyRet, i32 -1, {} poison, i64 poison, <4 x i32> %arg), !continuation.registercount !1
   unreachable
 }
@@ -38,12 +38,14 @@ define void @simple_await_entry(i64 %dummyRet, <4 x i32> %arg, <4 x i32> addrspa
 ; CHECK-NEXT:    store ptr addrspace(1) [[MEM]], ptr addrspace(32) [[MEM_SPILL_ADDR]], align 4
 ; CHECK-NEXT:    [[ARG_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_ENTRY_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
 ; CHECK-NEXT:    store <4 x i32> [[ARG]], ptr addrspace(32) [[ARG_SPILL_ADDR]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await_entry.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun to i64), i32 -1, {} poison, i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; CHECK-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await_entry.resume.0)
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[CALLEE]], i32 -1, {} poison, i32 poison, i64 [[TMP1]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
 ; CHECK-NEXT:    unreachable
 ;
-  %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1
-  call void @await.void(%continuation.token* %tok)
+  %callee = ptrtoint ptr @async_fun to i64
+  call void (...) @lgc.cps.await__void(i64 %callee, i32 3), !continuation.registercount !1, !continuation.returnedRegistercount !1
   store <4 x i32> %arg, <4 x i32> addrspace(1)* %mem
   call void @lgc.cps.complete(), !continuation.registercount !1
   unreachable
diff --git a/llvmraytracing/test/dx/cleanup-continuations.ll b/llvmraytracing/test/dx/cleanup-continuations.ll
index 9ddee2abb0..f13588003e 100644
--- a/llvmraytracing/test/dx/cleanup-continuations.ll
+++ b/llvmraytracing/test/dx/cleanup-continuations.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 3
-; RUN: opt --verify-each -passes='legacy-cleanup-continuations,lint,continuations-lint' -S %s --lint-abort-on-error | FileCheck %s
+; RUN: opt --verify-each -passes='dxil-cleanup-continuations,lint,continuations-lint' -S %s --lint-abort-on-error | FileCheck %s
 
 target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
 
@@ -22,15 +22,17 @@ define { i8*, %continuation.token* } @simple_await(i64 %dummyRet, i8* %0) !conti
 ; CHECK-NEXT:    [[FRAMEPTR:%.*]] = bitcast ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]] to ptr addrspace(32)
 ; CHECK-NEXT:    [[DOTSPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[FRAMEPTR]], i32 0, i32 0
 ; CHECK-NEXT:    store i64 -1, ptr addrspace(32) [[DOTSPILL_ADDR]], align 4
+; CHECK-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun to i64), i32 -1, {} poison, i64 [[TMP0]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[CALLEE]], i32 -1, {} poison, i32 poison, i64 [[TMP0]], i64 2), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]]
 ; CHECK-NEXT:    unreachable
 ;
 AllocaSpillBB:
   %FramePtr = bitcast i8* %0 to %simple_await.Frame*
   %.spill.addr = getelementptr inbounds %simple_await.Frame, %simple_await.Frame* %FramePtr, i32 0, i32 0
   store i64 -1, i64* %.spill.addr, align 4
-  %tok = call %continuation.token* @async_fun(), !continuation.registercount !4, !continuation.returnedRegistercount !4
+  %callee = ptrtoint ptr @async_fun to i64
+  %tok = call %continuation.token* @async_fun(i64 %callee, i64 1, i64 2), !continuation.registercount !4, !continuation.returnedRegistercount !4
   %1 = insertvalue { i8*, %continuation.token* } { i8* bitcast ({ i8*, %continuation.token* } (i8*, i1)* @simple_await.resume.0 to i8*), %continuation.token* undef }, %continuation.token* %tok, 1
   ret { i8*, %continuation.token* } %1
 }
@@ -45,7 +47,7 @@ define internal { i8*, %continuation.token* } @simple_await.resume.0(i8* noalias
 ; CHECK-NEXT:    [[DOTRELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[FRAMEPTR]], i32 0, i32 0
 ; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i64, ptr addrspace(32) [[DOTRELOAD_ADDR]], align 4
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 8)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[DOTRELOAD]], i32 -1, {} poison, i64 poison), !continuation.registercount [[META2]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[DOTRELOAD]], i32 -1, {} poison, i32 poison, i64 poison), !continuation.registercount [[META2]]
 ; CHECK-NEXT:    unreachable
 ;
 entryresume.0:
@@ -53,7 +55,7 @@ entryresume.0:
   %vFrame = bitcast %simple_await.Frame* %FramePtr to i8*
   %.reload.addr = getelementptr inbounds %simple_await.Frame, %simple_await.Frame* %FramePtr, i32 0, i32 0
   %.reload = load i64, i64* %.reload.addr, align 4
-  call void (...) @lgc.cps.jump(i64 %.reload, i32 -1, {} poison, i64 poison), !continuation.registercount !4
+  call void (...) @lgc.cps.jump(i64 %.reload, i32 -1, {} poison, i32 poison, i64 poison), !continuation.registercount !4
   unreachable
 }
 
@@ -63,13 +65,15 @@ define { i8*, %continuation.token* } @simple_await_entry(i64 %dummyRet, i8* %0)
 ; CHECK-NEXT:  AllocaSpillBB:
 ; CHECK-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8)
 ; CHECK-NEXT:    [[FRAMEPTR:%.*]] = bitcast ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]] to ptr addrspace(32)
+; CHECK-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await_entry.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun to i64), i32 -1, {} poison, i64 [[TMP0]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[CALLEE]], i32 -1, {} poison, i32 poison, i64 [[TMP0]], i64 2), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]]
 ; CHECK-NEXT:    unreachable
 ;
 AllocaSpillBB:
   %FramePtr = bitcast i8* %0 to %simple_await_entry.Frame*
-  %tok = call %continuation.token* @async_fun(), !continuation.registercount !4, !continuation.returnedRegistercount !4
+  %callee = ptrtoint ptr @async_fun to i64
+  %tok = call %continuation.token* @async_fun(i64 %callee, i64 1, i64 2), !continuation.registercount !4, !continuation.returnedRegistercount !4
   %1 = bitcast { i8*, %continuation.token* } (i8*, i1)* @simple_await_entry.resume.0 to i8*
   %2 = insertvalue { i8*, %continuation.token* } undef, i8* %1, 0
   %3 = insertvalue { i8*, %continuation.token* } %2, %continuation.token* %tok, 1
@@ -85,8 +89,6 @@ define internal { i8*, %continuation.token* } @simple_await_entry.resume.0(i8* n
 ; CHECK-NEXT:    [[VFRAME:%.*]] = bitcast ptr addrspace(32) [[FRAMEPTR]] to ptr addrspace(32)
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 8)
 ; CHECK-NEXT:    ret void
-; CHECK:       entryresume.0.split:
-; CHECK-NEXT:    unreachable
 ;
 entryresume.0:
   %FramePtr = bitcast i8* %0 to %simple_await_entry.Frame*
@@ -102,14 +104,16 @@ define { i8*, %continuation.token* } @await_with_ret_value(i64 %dummyRet, i8* %0
 ; CHECK-NEXT:    [[FRAMEPTR:%.*]] = bitcast ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]] to ptr addrspace(32)
 ; CHECK-NEXT:    [[DOTSPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_RET_VALUE_FRAME:%.*]], ptr addrspace(32) [[FRAMEPTR]], i32 0, i32 0
 ; CHECK-NEXT:    store i64 -1, ptr addrspace(32) [[DOTSPILL_ADDR]], align 4
+; CHECK-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @await_with_ret_value.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun to i64), i32 -1, {} poison, i64 [[TMP1]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[CALLEE]], i32 -1, {} poison, i32 poison, i64 [[TMP1]], i64 2), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]]
 ; CHECK-NEXT:    unreachable
 ;
   %FramePtr = bitcast i8* %0 to %await_with_ret_value.Frame*
   %.spill.addr = getelementptr inbounds %await_with_ret_value.Frame, %await_with_ret_value.Frame* %FramePtr, i32 0, i32 0
   store i64 -1, i64* %.spill.addr, align 4
-  %tok = call %continuation.token* @async_fun(), !continuation.registercount !4, !continuation.returnedRegistercount !4
+  %callee = ptrtoint ptr @async_fun to i64
+  %tok = call %continuation.token* @async_fun(i64 %callee, i64 1, i64 2), !continuation.registercount !4, !continuation.returnedRegistercount !4
   %res = insertvalue { i8*, %continuation.token* } { i8* bitcast ({ i8*, %continuation.token* } (i8*, i1)* @await_with_ret_value.resume.0 to i8*), %continuation.token* undef }, %continuation.token* %tok, 1
   ret { i8*, %continuation.token* } %res
 }
@@ -123,7 +127,7 @@ define internal { i8*, %continuation.token* } @await_with_ret_value.resume.0(i8*
 ; CHECK-NEXT:    [[DOTRELOAD_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_RET_VALUE_FRAME:%.*]], ptr addrspace(32) [[FRAMEPTR]], i32 0, i32 0
 ; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i64, ptr addrspace(32) [[DOTRELOAD_ADDR]], align 4
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 8)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[DOTRELOAD]], i32 -1, {} poison, i64 poison, i32 [[RES1]]), !continuation.registercount [[META2]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[DOTRELOAD]], i32 -1, {} poison, i32 poison, i64 poison, i32 [[RES1]]), !continuation.registercount [[META2]]
 ; CHECK-NEXT:    unreachable
 ;
   %FramePtr = bitcast i8* %0 to %await_with_ret_value.Frame*
@@ -131,7 +135,7 @@ define internal { i8*, %continuation.token* } @await_with_ret_value.resume.0(i8*
   %.reload.addr = getelementptr inbounds %await_with_ret_value.Frame, %await_with_ret_value.Frame* %FramePtr, i32 0, i32 0
   %.reload = load i64, i64* %.reload.addr, align 4
   %res = call i32 @lgc.ilcps.getReturnValue__i32()
-  call void (...) @lgc.cps.jump(i64 %.reload, i32 -1, {} poison, i64 poison, i32 %res), !continuation.registercount !4
+  call void (...) @lgc.cps.jump(i64 %.reload, i32 -1, {} poison, i32 poison, i64 poison, i32 %res), !continuation.registercount !4
   unreachable
 }
 
@@ -154,7 +158,7 @@ define { i8*, %continuation.token* } @switch_case_unreachable(i64 %dummyRet, i8*
 ; CHECK-NEXT:    br label [[A]]
 ; CHECK:       a:
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 8)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[DUMMYRET]], i32 -1, {} poison, i64 poison), !continuation.registercount [[META2]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[DUMMYRET]], i32 -1, {} poison, i32 poison, i64 poison), !continuation.registercount [[META2]]
 ; CHECK-NEXT:    unreachable
 ;
   %FramePtr = bitcast i8* %0 to %await_with_ret_value.Frame*
@@ -173,7 +177,7 @@ b:
   br label %a
 
 a:
-  call void (...) @lgc.cps.jump(i64 %dummyRet, i32 -1, {} poison, i64 poison), !continuation.registercount !4
+  call void (...) @lgc.cps.jump(i64 %dummyRet, i32 -1, {} poison, i32 poison, i64 poison), !continuation.registercount !4
   unreachable
 }
 
@@ -194,7 +198,7 @@ define { i8*, %continuation.token* } @phi_of_cont_state(i64 %dummyRet, ptr %Fram
 ; CHECK-NEXT:    [[C:%.*]] = phi ptr addrspace(32) [ [[A]], [[LA]] ], [ [[B]], [[LB]] ]
 ; CHECK-NEXT:    store i64 -1, ptr addrspace(32) [[C]], align 4
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 8)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[DUMMYRET]], i32 -1, {} poison, i64 poison), !continuation.registercount [[META2]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[DUMMYRET]], i32 -1, {} poison, i32 poison, i64 poison), !continuation.registercount [[META2]]
 ; CHECK-NEXT:    unreachable
 ;
   %cond = trunc i64 %dummyRet to i1
@@ -211,7 +215,7 @@ lb:
 end:
   %c = phi ptr [ %a, %la ], [ %b, %lb ]
   store i64 -1, ptr %c, align 4
-  call void (...) @lgc.cps.jump(i64 %dummyRet, i32 -1, {} poison, i64 poison), !continuation.registercount !4
+  call void (...) @lgc.cps.jump(i64 %dummyRet, i32 -1, {} poison, i32 poison, i64 poison), !continuation.registercount !4
   unreachable
 }
 
diff --git a/llvmraytracing/test/dx/closest-hit-procedural.ll b/llvmraytracing/test/dx/closest-hit-procedural.ll
index 1ba48b4475..db20264f2e 100644
--- a/llvmraytracing/test/dx/closest-hit-procedural.ll
+++ b/llvmraytracing/test/dx/closest-hit-procedural.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s
-; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s
+; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s
 
 ; Check a procedural closest hit shader with hit attributes that does not fit into system data alone
 
@@ -71,6 +71,8 @@ define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hi
 ; Function Attrs: nounwind memory(none)
 declare !pointeetys !27 i32 @_cont_DispatchRaysIndex(%struct.DispatchSystemData* nocapture readnone, i32) #1
 
+declare !pointeetys !27 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 ; Function Attrs: nounwind memory(none)
 declare !pointeetys !27 i32 @_cont_DispatchRaysDimensions(%struct.DispatchSystemData* nocapture readnone, i32) #1
 
@@ -175,7 +177,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP39:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP38]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP40:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP39]], [19 x i32] poison, [10 x i32] [[TMP40]]), !continuation.registercount [[META16]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP39]], [19 x i32] poison, [10 x i32] [[TMP40]]), !continuation.registercount [[META16]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define void @ClosestHit(
diff --git a/llvmraytracing/test/dx/closest-hit-traceray.ll b/llvmraytracing/test/dx/closest-hit-traceray.ll
index ab9465a7a4..f0180e5b27 100644
--- a/llvmraytracing/test/dx/closest-hit-traceray.ll
+++ b/llvmraytracing/test/dx/closest-hit-traceray.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s
-; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s
+; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s
 
 target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
 
@@ -37,6 +37,8 @@ declare !pointeetys !20 %struct.HitData @_cont_GetCandidateState(%struct.AnyHitT
 
 declare !pointeetys !22 %struct.HitData @_cont_GetCommittedState(%struct.SystemData*) #0
 
+declare !pointeetys !23 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #0 !pointeetys !23 {
 ; LOWERRAYTRACINGPIPELINE-LABEL: define i32 @_cont_GetLocalRootIndex(
 ; LOWERRAYTRACINGPIPELINE-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -180,7 +182,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP44:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP43]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP50:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP44]], [19 x i32] poison, [10 x i32] [[TMP50]]), !continuation.registercount [[META16]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP44]], [19 x i32] poison, [10 x i32] [[TMP50]]), !continuation.registercount [[META16]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define void @ClosestHit(
diff --git a/llvmraytracing/test/dx/closest-hit.ll b/llvmraytracing/test/dx/closest-hit.ll
index 593fa611a7..d5246fbee9 100644
--- a/llvmraytracing/test/dx/closest-hit.ll
+++ b/llvmraytracing/test/dx/closest-hit.ll
@@ -29,6 +29,8 @@ declare !pointeetys !14 %struct.HitData @_cont_GetCandidateState(%struct.AnyHitT
 
 declare !pointeetys !16 %struct.HitData @_cont_GetCommittedState(%struct.SystemData*) #0
 
+declare !pointeetys !17 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #0 !pointeetys !17 {
 ; LOWERRAYTRACINGPIPELINE-LABEL: define i32 @_cont_GetLocalRootIndex(
 ; LOWERRAYTRACINGPIPELINE-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -140,7 +142,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP27:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP26]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP24:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]], [19 x i32] poison, [8 x i32] [[TMP24]]), !continuation.registercount [[META10]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]], [19 x i32] poison, [8 x i32] [[TMP24]]), !continuation.registercount [[META10]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
   %ptr = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0
diff --git a/llvmraytracing/test/dx/continuation-registercount.ll b/llvmraytracing/test/dx/continuation-registercount.ll
index 1e7418dd21..92953103d4 100644
--- a/llvmraytracing/test/dx/continuation-registercount.ll
+++ b/llvmraytracing/test/dx/continuation-registercount.ll
@@ -1,9 +1,9 @@
 ; RUN: grep -v MAX_REG_10 %s | \
-; RUN:    opt --verify-each --report-payload-register-sizes=byjump -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,continuations-stats-report,remove-types-metadata' -S --lint-abort-on-error 2>&1 | \
+; RUN:    opt --verify-each --report-payload-register-sizes=byjump -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,continuations-stats-report,remove-types-metadata' -S --lint-abort-on-error 2>&1 | \
 ; RUN:    FileCheck -check-prefixes=COMMON,MAX30 %s
 ;
 ; RUN: grep -v MAX_REG_30 %s | \
-; RUN:    opt --verify-each --report-payload-register-sizes=byjump -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,continuations-stats-report,remove-types-metadata' -S --lint-abort-on-error 2>&1 | \
+; RUN:    opt --verify-each --report-payload-register-sizes=byjump -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,continuations-stats-report,remove-types-metadata' -S --lint-abort-on-error 2>&1 | \
 ; RUN:    FileCheck -check-prefixes=COMMON,MAX10 %s
 
 ; The order of metadata on functions is non-deterministic, so make two different runs to match both of them.
@@ -83,6 +83,8 @@ define void @_cont_SetTriangleHitAttributes(%struct.SystemData* %data, %struct.B
   ret void
 }
 
+declare !pointeetys !35 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 ; Function Attrs: alwaysinline
 define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #0 !pointeetys !35 {
   ret i32 5
diff --git a/llvmraytracing/test/dx/continuation-stacksize.ll b/llvmraytracing/test/dx/continuation-stacksize.ll
index 37861b2f0a..92a4b61ba1 100644
--- a/llvmraytracing/test/dx/continuation-stacksize.ll
+++ b/llvmraytracing/test/dx/continuation-stacksize.ll
@@ -1,6 +1,6 @@
-; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \
+; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \
 ; RUN:     -S %s --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS-STACKSIZE %s
-; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,remove-types-metadata' \
+; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata' \
 ; RUN:     -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANUP-STATESIZE %s
 
 ; The order of metadata on functions is non-deterministic, so make two different runs to match both of them.
@@ -39,6 +39,8 @@ declare !pointeetys !17 %struct.BuiltInTriangleIntersectionAttributes @_cont_Get
 ; Function Attrs: nounwind memory(none)
 declare !pointeetys !19 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #1
 
+declare !pointeetys !21 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !{%struct.DispatchSystemData poison} {
   ret void
 }
diff --git a/llvmraytracing/test/dx/continuation-state.ll b/llvmraytracing/test/dx/continuation-state.ll
index fdff6bdc3f..e36891c69c 100644
--- a/llvmraytracing/test/dx/continuation-state.ll
+++ b/llvmraytracing/test/dx/continuation-state.ll
@@ -1,26 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
-; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s
+; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s
 
 target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
 
-%continuation.token = type { }
-
-declare void @await.void(%continuation.token*)
+declare void @lgc.cps.await__void(...)
 declare i32 @_cont_GetContinuationStackAddr()
-declare %continuation.token* @async_fun()
+declare ptr @async_fun(i64, i32)
 declare void @lgc.cps.jump(...)
 declare void @lgc.cps.complete()
 
 define <4 x i32> @simple_await(i64 %returnAddr, <4 x i32> %arg) !continuation.registercount !1 {
-  %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1
-  call void @await.void(%continuation.token* %tok)
+  %callee = ptrtoint ptr @async_fun to i64
+  call void (...) @lgc.cps.await__void(i64 %callee, i32 3), !continuation.registercount !1, !continuation.returnedRegistercount !1
   call void (...) @lgc.cps.jump(i64 %returnAddr, i32 -1, i64 poison, i64 poison, <4 x i32> %arg), !continuation.registercount !1
   unreachable
 }
 
 define void @simple_await_entry(i64 %returnAddr, <4 x i32> %arg, <4 x i32> addrspace(1)* %mem) !continuation.entry !0 !continuation.registercount !1 {
-  %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1
-  call void @await.void(%continuation.token* %tok)
+  %callee = ptrtoint ptr @async_fun to i64
+  call void (...) @lgc.cps.await__void(i64 %callee, i32 3), !continuation.registercount !1, !continuation.returnedRegistercount !1
   store <4 x i32> %arg, <4 x i32> addrspace(1)* %mem
   call void @lgc.cps.complete(), !continuation.registercount !1
   unreachable
@@ -41,18 +39,20 @@ define void @simple_await_entry(i64 %returnAddr, <4 x i32> %arg, <4 x i32> addrs
 ; CLEANUP-NEXT:    store <4 x i32> [[ARG]], ptr addrspace(32) [[ARG_SPILL_ADDR]], align 4
 ; CLEANUP-NEXT:    [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1
 ; CLEANUP-NEXT:    store i64 [[RETURNADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4
-; CLEANUP-NEXT:    [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await.resume.0)
-; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun to i64), i32 -1, {} poison, i64 [[TMP0]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]]
+; CLEANUP-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
+; CLEANUP-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; CLEANUP-NEXT:    [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await.resume.0)
+; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 [[CALLEE]], i32 -1, {} poison, i32 poison, i64 [[TMP1]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]]
 ; CLEANUP-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-LABEL: define dso_local void @simple_await.resume.0(
 ; CLEANUP-SAME: i64 [[TMP0:%.*]]) !continuation.registercount [[META2]] !continuation [[META3]] {
 ; CLEANUP-NEXT:  entryresume.0:
-; CLEANUP-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24)
-; CLEANUP-NEXT:    [[ARG_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
+; CLEANUP-NEXT:    [[TMP1:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24)
+; CLEANUP-NEXT:    [[ARG_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[TMP1]], i32 0, i32 0
 ; CLEANUP-NEXT:    [[ARG_RELOAD:%.*]] = load <4 x i32>, ptr addrspace(32) [[ARG_RELOAD_ADDR]], align 4
-; CLEANUP-NEXT:    [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1
+; CLEANUP-NEXT:    [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME]], ptr addrspace(32) [[TMP1]], i32 0, i32 1
 ; CLEANUP-NEXT:    [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(32) [[RETURNADDR_RELOAD_ADDR]], align 4
 ; CLEANUP-NEXT:    call void @lgc.cps.free(i32 24)
 ; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, i64 poison, i64 poison, <4 x i32> [[ARG_RELOAD]]), !continuation.registercount [[META2]]
@@ -67,22 +67,22 @@ define void @simple_await_entry(i64 %returnAddr, <4 x i32> %arg, <4 x i32> addrs
 ; CLEANUP-NEXT:    store ptr addrspace(1) [[MEM]], ptr addrspace(32) [[MEM_SPILL_ADDR]], align 4
 ; CLEANUP-NEXT:    [[ARG_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_ENTRY_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
 ; CLEANUP-NEXT:    store <4 x i32> [[ARG]], ptr addrspace(32) [[ARG_SPILL_ADDR]], align 4
-; CLEANUP-NEXT:    [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await_entry.resume.0)
-; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun to i64), i32 -1, {} poison, i64 [[TMP0]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]]
+; CLEANUP-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
+; CLEANUP-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; CLEANUP-NEXT:    [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await_entry.resume.0)
+; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 [[CALLEE]], i32 -1, {} poison, i32 poison, i64 [[TMP1]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]]
 ; CLEANUP-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-LABEL: define dso_local void @simple_await_entry.resume.0(
 ; CLEANUP-SAME: i64 [[TMP0:%.*]]) !continuation.registercount [[META2]] !continuation [[META6]] {
 ; CLEANUP-NEXT:  entryresume.0:
-; CLEANUP-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24)
-; CLEANUP-NEXT:    [[MEM_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_ENTRY_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1
+; CLEANUP-NEXT:    [[TMP1:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24)
+; CLEANUP-NEXT:    [[MEM_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_ENTRY_FRAME:%.*]], ptr addrspace(32) [[TMP1]], i32 0, i32 1
 ; CLEANUP-NEXT:    [[MEM_RELOAD:%.*]] = load ptr addrspace(1), ptr addrspace(32) [[MEM_RELOAD_ADDR]], align 4
-; CLEANUP-NEXT:    [[ARG_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_ENTRY_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
+; CLEANUP-NEXT:    [[ARG_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_ENTRY_FRAME]], ptr addrspace(32) [[TMP1]], i32 0, i32 0
 ; CLEANUP-NEXT:    [[ARG_RELOAD:%.*]] = load <4 x i32>, ptr addrspace(32) [[ARG_RELOAD_ADDR]], align 4
 ; CLEANUP-NEXT:    store <4 x i32> [[ARG_RELOAD]], ptr addrspace(1) [[MEM_RELOAD]], align 4
 ; CLEANUP-NEXT:    call void @lgc.cps.free(i32 24)
 ; CLEANUP-NEXT:    ret void
-; CLEANUP:       entryresume.0.split:
-; CLEANUP-NEXT:    unreachable
 ;
diff --git a/llvmraytracing/test/dx/continuation-without-await.ll b/llvmraytracing/test/dx/continuation-without-await.ll
index b26552c00e..81b2cbf1f8 100644
--- a/llvmraytracing/test/dx/continuation-without-await.ll
+++ b/llvmraytracing/test/dx/continuation-without-await.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
 ; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s
-; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,continuations-lint,remove-types-metadata' \
+; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,continuations-lint,remove-types-metadata' \
 ; RUN:     -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s
-; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \
+; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \
 ; RUN:     -S %s --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS %s
 
 ; @called and @main_no_call must be marked as continuation and end with a continue call to the return address
@@ -33,6 +33,8 @@ declare !pointeetys !16 %struct.BuiltInTriangleIntersectionAttributes @_cont_Get
 ; Function Attrs: nounwind memory(none)
 declare !pointeetys !18 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #0
 
+declare !pointeetys !20 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !{%struct.DispatchSystemData poison} {
   ret void
 }
@@ -144,11 +146,11 @@ attributes #2 = { nounwind }
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP8]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP3]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP4:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP5:%.*]] = call ptr inttoptr (i64 2 to ptr)(i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [2 x i32] poison, [1 x i32] [[TMP4]]), !continuation.registercount [[META22:![0-9]+]], !continuation.returnedRegistercount [[META22]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP6:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } @await(ptr [[TMP5]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP6:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa2i32a1i32s(i64 2, i32 4, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [2 x i32] poison, [1 x i32] [[TMP4]]), !continuation.registercount [[META22:![0-9]+]], !continuation.returnedRegistercount [[META22]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP11:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP6]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [1 x i32] [[TMP11]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_THEIRPARAMS]] poison, ptr [[PARAMS]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP9:%.*]] = freeze [[STRUCT_THEIRPARAMS]] poison
+; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_THEIRPARAMS]] [[TMP9]], ptr [[PARAMS]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP17:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP17]], ptr [[TMP15]], align 4
@@ -203,7 +205,7 @@ attributes #2 = { nounwind }
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP26]], ptr [[TMP24]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP27:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP28:%.*]] = load [3 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]], [2 x i32] poison, [3 x i32] [[TMP28]]), !continuation.registercount [[META18]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]], [2 x i32] poison, [3 x i32] [[TMP28]]), !continuation.registercount [[META18]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ;
@@ -220,7 +222,7 @@ attributes #2 = { nounwind }
 ; CLEANUP-NEXT:    [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT3]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 0, 0
 ; CLEANUP-NEXT:    [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @main.resume.0)
-; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 2, i32 -1, {} poison, i64 [[TMP1]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [2 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META22:![0-9]+]], !continuation.returnedRegistercount [[META22]]
+; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 2, i32 -1, {} poison, i32 poison, i64 [[TMP1]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [2 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META22:![0-9]+]], !continuation.returnedRegistercount [[META22]]
 ; CLEANUP-NEXT:    unreachable
 ;
 ;
@@ -229,12 +231,12 @@ attributes #2 = { nounwind }
 ; CLEANUP-NEXT:  entryresume.0:
 ; CLEANUP-NEXT:    [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 2
 ; CLEANUP-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP3]], 0
+; CLEANUP-NEXT:    [[TMP4:%.*]] = freeze [[STRUCT_THEIRPARAMS:%.*]] poison
+; CLEANUP-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_THEIRPARAMS]] [[TMP4]], 0, 0
 ; CLEANUP-NEXT:    [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_0_EXTRACT4:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP2]], 0
 ; CLEANUP-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
 ; CLEANUP-NEXT:    ret void
-; CLEANUP:       entryresume.0.split:
-; CLEANUP-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-LABEL: define void @main_no_call(
@@ -243,8 +245,6 @@ attributes #2 = { nounwind }
 ; CLEANUP-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0
 ; CLEANUP-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
 ; CLEANUP-NEXT:    ret void
-; CLEANUP:       AllocaSpillBB.split:
-; CLEANUP-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-LABEL: define void @called(
@@ -259,7 +259,7 @@ attributes #2 = { nounwind }
 ; CLEANUP-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1
 ; CLEANUP-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2
-; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT5]], [2 x i32] poison, [3 x i32] [[DOTFCA_2_INSERT]]), !continuation.registercount [[META18]]
+; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT5]], [2 x i32] poison, [3 x i32] [[DOTFCA_2_INSERT]]), !continuation.registercount [[META18]]
 ; CLEANUP-NEXT:    unreachable
 ;
 ;
@@ -290,12 +290,12 @@ attributes #2 = { nounwind }
 ; POSTPROCESS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
 ; POSTPROCESS-NEXT:    [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 2
 ; POSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP3]], 0
+; POSTPROCESS-NEXT:    [[TMP4:%.*]] = freeze [[STRUCT_THEIRPARAMS:%.*]] poison
+; POSTPROCESS-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_THEIRPARAMS]] [[TMP4]], 0, 0
 ; POSTPROCESS-NEXT:    [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 0
 ; POSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT4:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP2]], 0
 ; POSTPROCESS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
 ; POSTPROCESS-NEXT:    ret void
-; POSTPROCESS:       entryresume.0.split:
-; POSTPROCESS-NEXT:    unreachable
 ;
 ;
 ; POSTPROCESS-LABEL: define void @main_no_call(
@@ -306,8 +306,6 @@ attributes #2 = { nounwind }
 ; POSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0
 ; POSTPROCESS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
 ; POSTPROCESS-NEXT:    ret void
-; POSTPROCESS:       AllocaSpillBB.split:
-; POSTPROCESS-NEXT:    unreachable
 ;
 ;
 ; POSTPROCESS-LABEL: define void @called(
diff --git a/llvmraytracing/test/dx/dxil-cont-prepare-traversal.ll b/llvmraytracing/test/dx/dxil-cont-prepare-traversal.ll
index 651080ad60..8c85927b33 100644
--- a/llvmraytracing/test/dx/dxil-cont-prepare-traversal.ll
+++ b/llvmraytracing/test/dx/dxil-cont-prepare-traversal.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
 ; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck --check-prefix=PREPARE %s
-; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck --check-prefix=ALL %s
+; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck --check-prefix=ALL %s
 
 target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
 
@@ -22,6 +22,8 @@ declare !pointeetys !2 i32 @"\01?_AmdValueGetI32Something@@YA_KXZ"(%struct.Trave
 
 declare !pointeetys !3 void @"\01?_AmdValueSetI32Something@@YA_KXZ"(%struct.TraversalData*, i32, i32)
 
+declare !pointeetys !8 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 
 declare !pointeetys !9 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind)
@@ -48,7 +50,7 @@ define void @_cont_Traversal(%struct.TraversalData* %data) #0 !pointeetys !4 {
   %a3 = add i32 %a2, %i3
   %a4 = add i32 %a3, %i4
   %addr = zext i32 %a4 to i64
-  call void @_AmdWaitEnqueueCall(i64 %addr, i64 -1, i64 0, %struct.SystemData* %4) #2
+  call void @_AmdWaitEnqueue(i64 %addr, i64 -1, i64 0, %struct.SystemData* %4) #2
   ret void
 
 6:                                                ; preds = %0
@@ -56,8 +58,6 @@ define void @_cont_Traversal(%struct.TraversalData* %data) #0 !pointeetys !4 {
   ret void
 }
 
-declare !pointeetys !5 void @_AmdWaitEnqueueCall(i64, i64, i64, %struct.SystemData*) #1
-
 declare !pointeetys !5 void @_AmdWaitEnqueue(i64, i64, i64, %struct.SystemData*) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -97,12 +97,11 @@ attributes #2 = { nounwind }
 ; PREPARE-NEXT:    [[A4:%.*]] = add i32 [[A3]], [[I4]]
 ; PREPARE-NEXT:    [[ADDR:%.*]] = zext i32 [[A4]] to i64
 ; PREPARE-NEXT:    [[TMP7:%.*]] = load [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP5]], align 4
-; PREPARE-NEXT:    [[TMP10:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @_cont_Traversal)
-; PREPARE-NEXT:    call void (...) @lgc.cps.jump(i64 [[ADDR]], i32 -1, {} poison, i64 [[TMP10]], [[STRUCT_SYSTEMDATA]] [[TMP7]]), !waitmask [[META1:![0-9]+]]
+; PREPARE-NEXT:    call void (...) @lgc.cps.jump(i64 [[ADDR]], i32 -1, {} poison, i32 poison, i64 0, [[STRUCT_SYSTEMDATA]] [[TMP7]]), !waitmask [[META1:![0-9]+]]
 ; PREPARE-NEXT:    unreachable
-; PREPARE:       9:
+; PREPARE:       8:
 ; PREPARE-NEXT:    [[TMP9:%.*]] = load [[STRUCT_SYSTEMDATA]], ptr [[TMP5]], align 4
-; PREPARE-NEXT:    call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 2, [[STRUCT_SYSTEMDATA]] [[TMP9]]), !waitmask [[META1]]
+; PREPARE-NEXT:    call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i32 poison, i64 2, [[STRUCT_SYSTEMDATA]] [[TMP9]]), !waitmask [[META1]]
 ; PREPARE-NEXT:    unreachable
 ;
 ;
@@ -176,7 +175,6 @@ attributes #2 = { nounwind }
 ; ALL-NEXT:    [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[TMP5]], i32 0, i32 1
 ; ALL-NEXT:    [[DOTFCA_1_LOAD:%.*]] = load float, ptr [[DOTFCA_1_GEP]], align 4
 ; ALL-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] [[DOTFCA_0_0_INSERT]], float [[DOTFCA_1_LOAD]], 1
-; ALL-NEXT:    [[TMP12:%.*]] = call i64 @continuation.getAddrAndMD(ptr @_cont_Traversal)
 ; ALL-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [30 x i32] poison, i32 1, 0
 ; ALL-NEXT:    [[DOTFCA_1_INSERT1:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1
 ; ALL-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT1]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2
@@ -208,9 +206,9 @@ attributes #2 = { nounwind }
 ; ALL-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; ALL-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
 ; ALL-NEXT:    [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4
-; ALL-NEXT:    call void (...) @lgc.ilcps.waitContinue(i64 [[ADDR]], i64 -1, i32 [[TMP11]], i64 [[TMP12]], [[STRUCT_SYSTEMDATA]] [[DOTFCA_1_INSERT]], [9 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]])
+; ALL-NEXT:    call void (...) @lgc.ilcps.waitContinue(i64 [[ADDR]], i64 -1, i32 [[TMP11]], i64 0, [[STRUCT_SYSTEMDATA]] [[DOTFCA_1_INSERT]], [9 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]])
 ; ALL-NEXT:    unreachable
-; ALL:       12:
+; ALL:       11:
 ; ALL-NEXT:    [[DOTFCA_0_0_GEP1:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[TMP5]], i32 0, i32 0, i32 0
 ; ALL-NEXT:    [[DOTFCA_0_0_LOAD2:%.*]] = load i32, ptr [[DOTFCA_0_0_GEP1]], align 4
 ; ALL-NEXT:    [[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] poison, i32 [[DOTFCA_0_0_LOAD2]], 0, 0
diff --git a/llvmraytracing/test/dx/dxil-cps-stack-lowering-global.ll b/llvmraytracing/test/dx/dxil-cps-stack-lowering-global.ll
index 8c5d53e7fa..6c0afec5ea 100644
--- a/llvmraytracing/test/dx/dxil-cps-stack-lowering-global.ll
+++ b/llvmraytracing/test/dx/dxil-cps-stack-lowering-global.ll
@@ -41,18 +41,20 @@ AllocaSpillBB:
   call void @amd.dx.setLocalRootIndex(i32 5)
   %ptr = getelementptr i8, ptr addrspace(32) %1, i32 9
   store i32 99, ptr addrspace(32) %ptr
+  %csp = ptrtoint ptr addrspace(32) %ptr to i32
   %dis_data.i.fca.0.insert = insertvalue %struct.DispatchSystemData poison, i32 %.fca.0.extract, 0
   %gep.payload = getelementptr i32, ptr %payload.serialization.alloca, i32 0
   store i32 undef, ptr %gep.payload, align 4
   %3 = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @called.resume.0)
   %payload.reload = load [1 x i32], ptr %payload.serialization.alloca, align 4
-  call void (...) @lgc.cps.jump(i32 2, i32 2, %struct.type %cont.state, i64 %3, %struct.DispatchSystemData %dis_data.i.fca.0.insert, {} poison, [1 x i32] %payload.reload), !continuation.registercount !16
+  call void (...) @lgc.cps.jump(i32 2, i32 2, %struct.type %cont.state, i32 %csp, i64 %3, %struct.DispatchSystemData %dis_data.i.fca.0.insert, {} poison, [1 x i32] %payload.reload), !continuation.registercount !16
   unreachable
 }
 
 define void @called.resume.0({} %cont.state, i32 %returnAddr, %struct.type %0, { %struct.DispatchSystemData, {}, [1 x i32] } %1) !lgc.rt.shaderstage !15 !lgc.cps !16 !continuation !17 {
 entryresume.0:
   %2 = call ptr addrspace(32) @lgc.cps.peek(i32 8)
+  %csp = ptrtoint ptr addrspace(32) %2 to i32
   %payload.serialization.alloca = alloca [1 x i32], align 4
   %payload = extractvalue  { %struct.DispatchSystemData, {}, [1 x i32] } %1, 2
   store [1 x i32] %payload, ptr %payload.serialization.alloca, align 4
@@ -68,7 +70,7 @@ entryresume.0:
   %.fca.0.insert = insertvalue %struct.DispatchSystemData poison, i32 %.fca.0.extract3, 0
   call void @lgc.cps.free(i32 8)
   %payload.reload = load [1 x i32], ptr %payload.serialization.alloca, align 4
-  call void (...) @lgc.cps.jump(i32 %return.addr.reload, i32 2, %struct.type %0, i64 poison, %struct.DispatchSystemData %.fca.0.insert, {} poison, [1 x i32] %payload.reload), !continuation.registercount !16
+  call void (...) @lgc.cps.jump(i32 %return.addr.reload, i32 2, %struct.type %0, i32 %csp, i64 poison, %struct.DispatchSystemData %.fca.0.insert, {} poison, [1 x i32] %payload.reload), !continuation.registercount !16
   unreachable
 }
 
diff --git a/llvmraytracing/test/dx/dxil-cps-stack-lowering-scratch.ll b/llvmraytracing/test/dx/dxil-cps-stack-lowering-scratch.ll
index 2c3c98acf8..af6de2dfe7 100644
--- a/llvmraytracing/test/dx/dxil-cps-stack-lowering-scratch.ll
+++ b/llvmraytracing/test/dx/dxil-cps-stack-lowering-scratch.ll
@@ -41,18 +41,20 @@ AllocaSpillBB:
   call void @amd.dx.setLocalRootIndex(i32 5)
   %ptr = getelementptr i8, ptr addrspace(32) %1, i32 9
   store i32 99, ptr addrspace(32) %ptr
+  %csp = ptrtoint ptr addrspace(32) %ptr to i32
   %dis_data.i.fca.0.insert = insertvalue %struct.DispatchSystemData poison, i32 %.fca.0.extract, 0
   %gep.payload = getelementptr i32, ptr %payload.serialization.alloca, i32 0
   store i32 undef, ptr %gep.payload, align 4
   %3 = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @called.resume.0)
   %payload.reload = load [1 x i32], ptr %payload.serialization.alloca, align 4
-  call void (...) @lgc.cps.jump(i32 2, i32 2, %struct.type %cont.state, i64 %3, %struct.DispatchSystemData %dis_data.i.fca.0.insert, {} poison, [1 x i32] %payload.reload), !continuation.registercount !16
+  call void (...) @lgc.cps.jump(i32 2, i32 2, %struct.type %cont.state, i32 %csp, i64 %3, %struct.DispatchSystemData %dis_data.i.fca.0.insert, {} poison, [1 x i32] %payload.reload), !continuation.registercount !16
   unreachable
 }
 
 define void @called.resume.0({} %cont.state, i32 %returnAddr, %struct.type %0, { %struct.DispatchSystemData, {}, [1 x i32] } %1) !lgc.rt.shaderstage !15 !lgc.cps !16 !continuation !17 {
 entryresume.0:
   %2 = call ptr addrspace(32) @lgc.cps.peek(i32 8)
+  %csp = ptrtoint ptr addrspace(32) %2 to i32
   %payload.serialization.alloca = alloca [1 x i32], align 4
   %payload = extractvalue  { %struct.DispatchSystemData, {}, [1 x i32] } %1, 2
   store [1 x i32] %payload, ptr %payload.serialization.alloca, align 4
@@ -68,7 +70,7 @@ entryresume.0:
   %.fca.0.insert = insertvalue %struct.DispatchSystemData poison, i32 %.fca.0.extract3, 0
   call void @lgc.cps.free(i32 8)
   %payload.reload = load [1 x i32], ptr %payload.serialization.alloca, align 4
-  call void (...) @lgc.cps.jump(i32 %return.addr.reload, i32 2, %struct.type %0, i64 poison, %struct.DispatchSystemData %.fca.0.insert, {} poison, [1 x i32] %payload.reload), !continuation.registercount !16
+  call void (...) @lgc.cps.jump(i32 %return.addr.reload, i32 2, %struct.type %0, i32 %csp, i64 poison, %struct.DispatchSystemData %.fca.0.insert, {} poison, [1 x i32] %payload.reload), !continuation.registercount !16
   unreachable
 }
 
diff --git a/llvmraytracing/test/dx/free-raygen-cont-state-in-persistent-launch.ll b/llvmraytracing/test/dx/free-raygen-cont-state-in-persistent-launch.ll
index 9e0aba31c2..1fbed145f8 100644
--- a/llvmraytracing/test/dx/free-raygen-cont-state-in-persistent-launch.ll
+++ b/llvmraytracing/test/dx/free-raygen-cont-state-in-persistent-launch.ll
@@ -1,6 +1,6 @@
 ; Tests that if _cont_ExitRayGen ends with an enqueue, then we still free RayGen continuation state.
 ; This is a regression test, in an earlier version we only freed for returns and missed this case.
-; RUN: grep -v "lgc.cps.module" %s | opt --verify-each -passes="dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck %s
+; RUN: grep -v "lgc.cps.module" %s | opt --verify-each -passes="dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck %s
 ; RUN: opt --verify-each -passes="dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck %s
 
 ; There is just a single RayGen shader in this module, so any free must come from it.
diff --git a/llvmraytracing/test/dx/global-mem-stack.ll b/llvmraytracing/test/dx/global-mem-stack.ll
index c7e726543b..4f2df75b65 100644
--- a/llvmraytracing/test/dx/global-mem-stack.ll
+++ b/llvmraytracing/test/dx/global-mem-stack.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
-; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck %s
+; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck %s
 
 target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
 
@@ -55,6 +55,8 @@ define void @_cont_SetTriangleHitAttributes(%struct.SystemData* %data, %struct.B
   ret void
 }
 
+declare !pointeetys !19 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) !pointeetys !19 {
   ret i32 5
 }
diff --git a/llvmraytracing/test/dx/inline-const-jump-target.ll b/llvmraytracing/test/dx/inline-const-jump-target.ll
index 3cb73b3dac..c7f5d05a8e 100644
--- a/llvmraytracing/test/dx/inline-const-jump-target.ll
+++ b/llvmraytracing/test/dx/inline-const-jump-target.ll
@@ -23,6 +23,8 @@ declare i32 @lgc.rt.shader.index()
 
 declare i32 @_cont_GetContinuationStackAddr()
 
+declare !pointeetys !13 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) !pointeetys !13 {
 ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define i32 @_cont_GetLocalRootIndex(
 ; LOWERRAYTRACINGPIPELINE-CPS-SAME: ptr [[DATA:%.*]]) {
@@ -60,7 +62,7 @@ define internal void @Callable(%struct.Payload* %payload) !pointeetys !23 !lgc.r
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[SHADER_INDEX]], ptr @debug_global, align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP1:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]]), !continuation.registercount [[META8:![0-9]+]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]]), !continuation.registercount [[META8:![0-9]+]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ;
 entry:
@@ -73,7 +75,7 @@ define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) !pointe
   %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4
   %callable.addr = call i32 @_AmdGetFuncAddrCallable()
   %ret.addr = call i32 @get.ret.addr()
-  call void (...) @lgc.cps.jump(i32 %callable.addr, i32 2, {} poison, i32 %ret.addr, i32 999, %struct.DispatchSystemData %dis_data, {} poison, [0 x i32] poison, [0 x i32] poison)
+  call void (...) @lgc.cps.jump(i32 %callable.addr, i32 2, {} poison, i32 poison, i32 %ret.addr, i32 999, %struct.DispatchSystemData %dis_data, {} poison, [0 x i32] poison, [0 x i32] poison)
   unreachable
 }
 
@@ -88,7 +90,7 @@ define void @main() {
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP2:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @Callable)
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[RET_ADDR_I:%.*]] = call i32 @get.ret.addr()
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[TMP2]], i32 2, {} poison, i32 [[RET_ADDR_I]], i32 999, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], {} poison, [0 x i32] poison, [0 x i32] poison)
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[TMP2]], i32 2, {} poison, i32 poison, i32 [[RET_ADDR_I]], i32 999, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], {} poison, [0 x i32] poison, [0 x i32] poison)
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE-CPS:       _cont_CallShader.exit:
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void @lgc.cps.complete()
@@ -108,7 +110,7 @@ define void @main() {
 ; JUMP-INLINER-CPS-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], ptr [[SYSTEM_DATA_ALLOCA_I]], align 4
 ; JUMP-INLINER-CPS-NEXT:    store i32 999, ptr @debug_global, align 4
 ; JUMP-INLINER-CPS-NEXT:    [[TMP2:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA_I]], align 4
-; JUMP-INLINER-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RET_ADDR_I]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP2]]), !continuation.registercount [[META8]]
+; JUMP-INLINER-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RET_ADDR_I]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP2]]), !continuation.registercount [[META8]]
 ; JUMP-INLINER-CPS-NEXT:    unreachable
 ; JUMP-INLINER-CPS:       Callable.exit:
 ; JUMP-INLINER-CPS-NEXT:    unreachable
diff --git a/llvmraytracing/test/dx/intersection-registercount.ll b/llvmraytracing/test/dx/intersection-registercount.ll
index d0e8edd950..38e7b3b443 100644
--- a/llvmraytracing/test/dx/intersection-registercount.ll
+++ b/llvmraytracing/test/dx/intersection-registercount.ll
@@ -1,4 +1,4 @@
-; RUN: opt --verify-each --report-payload-register-sizes=max -passes='dxil-cont-intrinsic-prepare,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,continuations-stats-report,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s
+; RUN: opt --verify-each --report-payload-register-sizes=max -passes='dxil-cont-intrinsic-prepare,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,continuations-stats-report,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s
 
 ; CHECK: Incoming and max outgoing payload VGPR size of "Intersection" (intersection): 25 and 25 dwords
 
@@ -37,6 +37,8 @@ declare !pointeetys !21 %struct.HitData @_cont_GetCandidateState(%struct.AnyHitT
 
 declare !pointeetys !23 %struct.HitData @_cont_GetCommittedState(%struct.SystemData*) #0
 
+declare !pointeetys !24 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #0 !pointeetys !24 {
   ret i32 5
 }
diff --git a/llvmraytracing/test/dx/intrinsics/complete.ll b/llvmraytracing/test/dx/intrinsics/complete.ll
index 38b7c78b8a..4e825e53c2 100644
--- a/llvmraytracing/test/dx/intrinsics/complete.ll
+++ b/llvmraytracing/test/dx/intrinsics/complete.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck --check-prefix=LOWERRAYTRACINGPIPELINE %s
-; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck --check-prefix=CLEANUP %s
+; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck --check-prefix=CLEANUP %s
 
 %struct.DispatchSystemData = type { i32 }
 %struct.TraversalData = type { i32 }
@@ -8,6 +8,7 @@
 @debug_global = external global i32
 declare i32 @Val(i32)
 declare void @_AmdComplete()
+declare !pointeetys !2 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 declare !pointeetys !2 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 declare !pointeetys !3 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind)
 
@@ -60,8 +61,6 @@ define void @_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage
 ; CLEANUP-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0
 ; CLEANUP-NEXT:    [[VAL:%.*]] = call i32 @Val(i32 5)
 ; CLEANUP-NEXT:    ret void
-; CLEANUP:       AllocaSpillBB.split:
-; CLEANUP-NEXT:    unreachable
 ;
 AllocaSpillBB:
   %val = call i32 @Val(i32 5)
diff --git a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll
index 79ff0e2e14..5e1676fe82 100644
--- a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll
+++ b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=ALL %s
+; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=ALL %s
 ; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s
 
 %struct.DispatchSystemData = type { i32 }
@@ -13,6 +13,7 @@
 
 declare i32 @_AmdContPayloadRegistersGetI32(i32)
 
+declare !pointeetys !9 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 declare !pointeetys !9 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 
 declare !pointeetys !11 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.DispatchSystemData*)
@@ -58,13 +59,13 @@ define void @_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP2]], ptr @debug_global, align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP3:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, [[STRUCT_SYSTEMDATA:%.*]] poison, [8 x i32] poison, [4 x i32] [[TMP3]]), !waitmask [[META4:![0-9]+]], !continuation.registercount [[META0]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_SYSTEMDATA:%.*]] poison, [8 x i32] poison, [4 x i32] [[TMP3]]), !waitmask [[META4:![0-9]+]], !continuation.registercount [[META0]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 entry:
   %val = call i32 @_AmdContPayloadRegistersGetI32(i32 2)
   store i32 %val, i32* @debug_global, align 4
-  call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, %struct.SystemData poison), !waitmask !2
+  call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i32 poison, i64 poison, %struct.SystemData poison), !waitmask !2
   unreachable
 }
 
diff --git a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll
index e3de9f1614..81d22a2670 100644
--- a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll
+++ b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=MINCOUNT %s
+; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=MINCOUNT %s
 ; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-MINCOUNT %s
 
 %struct.DispatchSystemData = type { i32 }
@@ -9,6 +9,7 @@
 declare i32 @_AmdContPayloadRegistersI32Count()
 %struct.TraversalData = type { i32 }
 
+declare !pointeetys !9 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 declare !pointeetys !9 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 declare !pointeetys !12 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind)
 
@@ -26,8 +27,6 @@ define void @main() {
 ; MINCOUNT-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
 ; MINCOUNT-NEXT:    store i32 11, ptr @debug_global, align 4
 ; MINCOUNT-NEXT:    ret void
-; MINCOUNT:       entry.split:
-; MINCOUNT-NEXT:    unreachable
 ;
 ; LOWERRAYTRACINGPIPELINE-MINCOUNT-LABEL: define void @main(
 ; LOWERRAYTRACINGPIPELINE-MINCOUNT-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !continuation [[META10:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META11:![0-9]+]] !continuation.registercount [[META5]] {
diff --git a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll
index e570e21080..40face4df3 100644
--- a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll
+++ b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=ALL %s
+; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=ALL %s
 ; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s
 
 %struct.DispatchSystemData = type { i32 }
@@ -11,6 +11,7 @@
 
 declare void @_AmdContPayloadRegistersSetI32(i32, i32)
 
+declare !pointeetys !9 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 declare !pointeetys !9 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 
 declare !pointeetys !11 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.DispatchSystemData*)
@@ -52,12 +53,12 @@ define void @_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP1:%.*]] = getelementptr [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 3
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 42, ptr [[TMP1]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP2:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, [[STRUCT_SYSTEMDATA:%.*]] poison, [8 x i32] poison, [4 x i32] [[TMP2]]), !waitmask [[META4:![0-9]+]], !continuation.registercount [[META0]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_SYSTEMDATA:%.*]] poison, [8 x i32] poison, [4 x i32] [[TMP2]]), !waitmask [[META4:![0-9]+]], !continuation.registercount [[META0]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 entry:
   call void @_AmdContPayloadRegistersSetI32(i32 3, i32 42)
-  call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, %struct.SystemData poison), !waitmask !2
+  call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i32 poison, i64 poison, %struct.SystemData poison), !waitmask !2
   unreachable
 }
 
diff --git a/llvmraytracing/test/dx/intrinsics/cont-stack-alloc.ll b/llvmraytracing/test/dx/intrinsics/cont-stack-alloc.ll
index 62185f1380..a48c774dc1 100644
--- a/llvmraytracing/test/dx/intrinsics/cont-stack-alloc.ll
+++ b/llvmraytracing/test/dx/intrinsics/cont-stack-alloc.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function main --version 2
-; RUN: opt --verify-each -passes='cgscc(inline),lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck %s
+; RUN: opt --verify-each -passes='cgscc(inline),lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck %s
 
 declare i32 @_AmdContStackAlloc(i32 %size)
 declare i32 @_AmdContPayloadRegistersI32Count()
@@ -9,6 +9,8 @@ declare i32 @_cont_GetContinuationStackAddr() #0
 %struct.HitData = type { float, i32 }
 %struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> }
 %struct.TraversalData = type { <3 x float>, <3 x float>, float }
+
+declare !pointeetys !15 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 declare !pointeetys !15 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 declare !pointeetys !16 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.DispatchSystemData*)
 declare !pointeetys !12 i32 @_cont_HitKind(%struct.DispatchSystemData*, %struct.HitData*)
@@ -36,8 +38,6 @@ define void @main() !lgc.rt.shaderstage !17 {
 ; CHECK-NEXT:    store i32 [[TMP2]], ptr [[CSP]], align 4
 ; CHECK-NEXT:    store i32 [[TMP1]], ptr @debug_global, align 4
 ; CHECK-NEXT:    ret void
-; CHECK:       entry.split:
-; CHECK-NEXT:    unreachable
 ;
 entry:
   %pl_size = call i32 @_AmdContPayloadRegistersI32Count()
diff --git a/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll b/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll
index 0e87ed5411..9bebd2b02f 100644
--- a/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll
+++ b/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll
@@ -6,6 +6,8 @@
 
 declare void @Use(i64)
 declare i64 @_AmdGetCurrentFuncAddr()
+
+declare !pointeetys !2 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 declare !pointeetys !2 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 
 define void @MyRayGen() {
diff --git a/llvmraytracing/test/dx/intrinsics/get-flags.ll b/llvmraytracing/test/dx/intrinsics/get-flags.ll
deleted file mode 100644
index 2cc945f7e6..0000000000
--- a/llvmraytracing/test/dx/intrinsics/get-flags.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint' -S %s --lint-abort-on-error | FileCheck %s
-
-declare i32 @_AmdContinuationsGetFlags()
-
-@debug_global = external global i32
-
-define void @main() !lgc.rt.shaderstage !1 {
-; CHECK-LABEL: define void @main(
-; CHECK-SAME: ) !lgc.rt.shaderstage [[META1:![0-9]+]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store i32 3, ptr @debug_global, align 4
-; CHECK-NEXT:    ret void
-;
-entry:
-  %val = call i32 @_AmdContinuationsGetFlags()
-  store i32 %val, ptr @debug_global
-  ret void
-}
-
-!continuation.flags = !{!0}
-
-!0 = !{i32 3}
-!1 = !{i32 0}
diff --git a/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll b/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll
index 601d72b158..62fe394a5e 100644
--- a/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll
+++ b/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll
@@ -8,6 +8,7 @@
 %struct.TraversalData = type { i32 }
 declare i32 @_AmdGetShaderKind()
 
+declare !pointeetys !3 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 declare !pointeetys !3 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 declare !pointeetys !5 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.DispatchSystemData*)
 declare !pointeetys !6 i32 @_cont_HitKind(%struct.DispatchSystemData*, %struct.HitData*)
@@ -47,7 +48,7 @@ define void @MyMiss(%struct.Payload* %payload) !pointeetys !1 !lgc.rt.shaderstag
 ; CHECK-NEXT:    store i32 [[TMP8]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], [8 x i32] poison, [1 x i32] [[TMP10]]), !continuation.registercount [[META5]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], [8 x i32] poison, [1 x i32] [[TMP10]]), !continuation.registercount [[META5]]
 ; CHECK-NEXT:    unreachable
 ;
   %1 = call i32 @_AmdGetShaderKind()
diff --git a/llvmraytracing/test/dx/intrinsics/get-shader-rec-idx.ll b/llvmraytracing/test/dx/intrinsics/get-shader-rec-idx.ll
new file mode 100644
index 0000000000..933cf00565
--- /dev/null
+++ b/llvmraytracing/test/dx/intrinsics/get-shader-rec-idx.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,lower-raytracing-pipeline,lint' -S --lint-abort-on-error | FileCheck --check-prefix=CHECK-NON-CPS %s
+; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck --check-prefix=CHECK-CPS %s
+; Exercises _AmdGetShaderRecordIndex through dxil-cont-intrinsic-prepare and lower-raytracing-pipeline twice: the first run line filters the CPS module marker out of the input with grep (non-CPS prefix), the second feeds the file unchanged (CPS prefix).
+%struct.DispatchSystemData = type { i32 }
+%struct.MyParams = type { i32 }
+%struct.AnyHitTraversalData = type { i64 }
+
+declare void @Use(i32)
+declare i32 @_AmdGetShaderRecordIndex()
+declare !pointeetys !2 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+declare !pointeetys !2 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
+declare !pointeetys !5 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind)
+; MyRayGen (shader stage metadata !1, i.e. i32 0): both run configurations expect the _AmdGetShaderRecordIndex call to be replaced by the constant 0 that is passed to @Use.
+define void @MyRayGen() !lgc.rt.shaderstage !1 {
+; CHECK-NON-CPS-LABEL: define void @MyRayGen(
+; CHECK-NON-CPS-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !continuation.entry [[META6:![0-9]+]] !continuation.registercount [[META4]] {
+; CHECK-NON-CPS-NEXT:  AllocaSpillBB:
+; CHECK-NON-CPS-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8
+; CHECK-NON-CPS-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4
+; CHECK-NON-CPS-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
+; CHECK-NON-CPS-NEXT:    call void @Use(i32 0)
+; CHECK-NON-CPS-NEXT:    call void @lgc.cps.complete()
+; CHECK-NON-CPS-NEXT:    unreachable
+;
+; CHECK-CPS-LABEL: define void @MyRayGen(
+; CHECK-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !lgc.cps [[META1:![0-9]+]] {
+; CHECK-CPS-NEXT:  AllocaSpillBB:
+; CHECK-CPS-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8
+; CHECK-CPS-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4
+; CHECK-CPS-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
+; CHECK-CPS-NEXT:    call void @Use(i32 0)
+; CHECK-CPS-NEXT:    call void @lgc.cps.complete()
+; CHECK-CPS-NEXT:    unreachable
+;
+AllocaSpillBB:
+  %idx = call i32 @_AmdGetShaderRecordIndex()
+  call void @Use(i32 %idx)
+  ret void
+}
+; called (shader stage metadata !3, i.e. i32 5): the non-CPS run expects @Use to receive the result of _cont_GetLocalRootIndex on the system data alloca, while the CPS run expects the incoming SHADER_INDEX function argument.
+define void @called(%struct.MyParams* %params) !pointeetys !4 !lgc.rt.shaderstage !3 {
+; CHECK-NON-CPS-LABEL: define %struct.DispatchSystemData @called(
+; CHECK-NON-CPS-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [9 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META7:![0-9]+]] !continuation [[META8:![0-9]+]] !continuation.registercount [[META1:![0-9]+]] {
+; CHECK-NON-CPS-NEXT:  AllocaSpillBB:
+; CHECK-NON-CPS-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8
+; CHECK-NON-CPS-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [1 x i32], align 4
+; CHECK-NON-CPS-NEXT:    [[TMP1:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8
+; CHECK-NON-CPS-NEXT:    store [1 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
+; CHECK-NON-CPS-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
+; CHECK-NON-CPS-NEXT:    [[TMP2:%.*]] = call i32 @_cont_GetLocalRootIndex(ptr [[SYSTEM_DATA_ALLOCA]])
+; CHECK-NON-CPS-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0
+; CHECK-NON-CPS-NEXT:    [[TMP4:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
+; CHECK-NON-CPS-NEXT:    store i32 [[TMP4]], ptr [[TMP3]], align 4
+; CHECK-NON-CPS-NEXT:    call void @Use(i32 [[TMP2]])
+; CHECK-NON-CPS-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0
+; CHECK-NON-CPS-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NON-CPS-NEXT:    store i32 [[TMP6]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
+; CHECK-NON-CPS-NEXT:    [[TMP7:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
+; CHECK-NON-CPS-NEXT:    [[TMP8:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
+; CHECK-NON-CPS-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP7]], [9 x i32] poison, [1 x i32] [[TMP8]]), !continuation.registercount [[META1]]
+; CHECK-NON-CPS-NEXT:    unreachable
+;
+; CHECK-CPS-LABEL: define void @called(
+; CHECK-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [9 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META6:![0-9]+]] !continuation [[META7:![0-9]+]] !lgc.cps [[META8:![0-9]+]] {
+; CHECK-CPS-NEXT:  AllocaSpillBB:
+; CHECK-CPS-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8
+; CHECK-CPS-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [1 x i32], align 4
+; CHECK-CPS-NEXT:    [[TMP0:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8
+; CHECK-CPS-NEXT:    store [1 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
+; CHECK-CPS-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
+; CHECK-CPS-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP0]], i32 0
+; CHECK-CPS-NEXT:    [[TMP2:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
+; CHECK-CPS-NEXT:    store i32 [[TMP2]], ptr [[TMP1]], align 4
+; CHECK-CPS-NEXT:    call void @Use(i32 [[SHADER_INDEX]])
+; CHECK-CPS-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP0]], i32 0
+; CHECK-CPS-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+; CHECK-CPS-NEXT:    store i32 [[TMP4]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
+; CHECK-CPS-NEXT:    [[TMP5:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
+; CHECK-CPS-NEXT:    [[TMP6:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
+; CHECK-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP5]], [9 x i32] poison, [1 x i32] [[TMP6]]), !continuation.registercount [[META1]]
+; CHECK-CPS-NEXT:    unreachable
+;
+AllocaSpillBB:
+  %idx = call i32 @_AmdGetShaderRecordIndex()
+  call void @Use(i32 %idx)
+  ret void
+}
+
+!lgc.cps.module = !{}
+!1 = !{i32 0}
+!2 = !{%struct.DispatchSystemData poison}
+!3 = !{i32 5}
+!4 = !{%struct.MyParams poison}
+!5 = !{%struct.AnyHitTraversalData poison}
diff --git a/llvmraytracing/test/dx/intrinsics/shader-index.ll b/llvmraytracing/test/dx/intrinsics/shader-index.ll
index c69eff2f46..1da0d3be02 100644
--- a/llvmraytracing/test/dx/intrinsics/shader-index.ll
+++ b/llvmraytracing/test/dx/intrinsics/shader-index.ll
@@ -8,6 +8,8 @@
 
 declare i32 @lgc.rt.shader.index()
 
+declare !pointeetys !8 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 
 define i1 @_cont_ReportHit(%struct.DispatchSystemData* %data, float %t, i32 %hitKind) #0 !pointeetys !20 {
@@ -55,7 +57,7 @@ define void @callable(%struct.Payload* %payload) !pointeetys !22 !lgc.rt.shaders
 ; CHECK-NEXT:    store i32 [[TMP4]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP5]], [8 x i32] poison, [1 x i32] [[TMP6]]), !continuation.registercount [[META10]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP5]], [8 x i32] poison, [1 x i32] [[TMP6]]), !continuation.registercount [[META10]]
 ; CHECK-NEXT:    unreachable
 ;
 entry:
diff --git a/llvmraytracing/test/dx/lower-await.ll b/llvmraytracing/test/dx/lower-await.ll
index 1ac5f29e97..1963719ff8 100644
--- a/llvmraytracing/test/dx/lower-await.ll
+++ b/llvmraytracing/test/dx/lower-await.ll
@@ -1,17 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt --verify-each -passes='lower-await,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=AWAIT %s
 ; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=CORO %s
-; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANED %s
+; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANED %s
 
 target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
 
-%continuation.token = type { }
-
-declare void @await.void(%continuation.token*)
-declare i32 @await.i32(%continuation.token*)
-declare %continuation.token* @async_fun()
-declare %continuation.token* @async_fun_with_waitmask()
-declare %continuation.token* @async_fun_with_arg(i32)
+declare ptr @async_fun(i64, i32)
+declare ptr @async_fun_with_waitmask(i64, i32)
+declare ptr @async_fun_with_arg(i64, i32, i32)
+declare void @lgc.cps.await__void(...)
+declare i32 @lgc.cps.await__i32(...)
 declare void @lgc.cps.jump(...)
 declare void @lgc.cps.complete()
 
@@ -20,9 +18,11 @@ define void @simple_await(i64 %dummyRetAddr) !continuation.registercount !1 {
 ; AWAIT-SAME: i64 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] {
 ; AWAIT-NEXT:    [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.simple_await, ptr @continuation.malloc, ptr @continuation.free)
 ; AWAIT-NEXT:    [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null)
-; AWAIT-NEXT:    [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
-; AWAIT-NEXT:    [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]])
-; AWAIT-NEXT:    call void (...) @lgc.cps.jump(i64 [[DUMMYRETADDR]], i32 -1, {} poison, i64 poison), !continuation.registercount [[META1]]
+; AWAIT-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
+; AWAIT-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; AWAIT-NEXT:    [[TMP5:%.*]] = call ptr [[TMP4]](i64 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; AWAIT-NEXT:    [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]])
+; AWAIT-NEXT:    call void (...) @lgc.cps.jump(i64 [[DUMMYRETADDR]], i32 -1, {} poison, i32 poison, i64 poison), !continuation.registercount [[META1]]
 ; AWAIT-NEXT:    unreachable
 ;
 ; CORO-LABEL: define { ptr, ptr } @simple_await(
@@ -30,10 +30,12 @@ define void @simple_await(i64 %dummyRetAddr) !continuation.registercount !1 {
 ; CORO-NEXT:  AllocaSpillBB:
 ; CORO-NEXT:    [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0
 ; CORO-NEXT:    store i64 [[DUMMYRETADDR]], ptr [[DUMMYRETADDR_SPILL_ADDR]], align 4
-; CORO-NEXT:    [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
-; CORO-NEXT:    [[TMP1:%.*]] = insertvalue { ptr, ptr } poison, ptr @simple_await.resume.0, 0
-; CORO-NEXT:    [[TMP2:%.*]] = insertvalue { ptr, ptr } [[TMP1]], ptr [[TOK]], 1
-; CORO-NEXT:    ret { ptr, ptr } [[TMP2]]
+; CORO-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
+; CORO-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; CORO-NEXT:    [[TMP2:%.*]] = call ptr [[TMP1]](i64 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; CORO-NEXT:    [[TMP3:%.*]] = insertvalue { ptr, ptr } poison, ptr @simple_await.resume.0, 0
+; CORO-NEXT:    [[TMP4:%.*]] = insertvalue { ptr, ptr } [[TMP3]], ptr [[TMP2]], 1
+; CORO-NEXT:    ret { ptr, ptr } [[TMP4]]
 ;
 ; CLEANED-LABEL: define void @simple_await(
 ; CLEANED-SAME: i64 [[DUMMYRETADDR:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] {
@@ -41,13 +43,15 @@ define void @simple_await(i64 %dummyRetAddr) !continuation.registercount !1 {
 ; CLEANED-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8)
 ; CLEANED-NEXT:    [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
 ; CLEANED-NEXT:    store i64 [[DUMMYRETADDR]], ptr addrspace(32) [[DUMMYRETADDR_SPILL_ADDR]], align 4
-; CLEANED-NEXT:    [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await.resume.0)
-; CLEANED-NEXT:    call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun to i64), i32 -1, {} poison, i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; CLEANED-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
+; CLEANED-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; CLEANED-NEXT:    [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await.resume.0)
+; CLEANED-NEXT:    call void (...) @lgc.cps.jump(i64 [[CALLEE]], i32 -1, {} poison, i32 poison, i64 [[TMP1]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
 ; CLEANED-NEXT:    unreachable
 ;
-  %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1
-  call void @await.void(%continuation.token* %tok)
-  call void (...) @lgc.cps.jump(i64 %dummyRetAddr, i32 -1, {} poison, i64 poison), !continuation.registercount !1
+  %callee = ptrtoint ptr @async_fun to i64
+  call void (...) @lgc.cps.await__void(i64 %callee, i32 2), !continuation.registercount !1, !continuation.returnedRegistercount !1
+  call void (...) @lgc.cps.jump(i64 %dummyRetAddr, i32 -1, {} poison, i32 poison, i64 poison), !continuation.registercount !1
   unreachable
 }
 
@@ -56,28 +60,34 @@ define void @simple_await_entry() !continuation.entry !0 !continuation.registerc
 ; AWAIT-SAME: ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation.entry [[META3:![0-9]+]] !continuation [[META4:![0-9]+]] {
 ; AWAIT-NEXT:    [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.simple_await_entry, ptr @continuation.malloc, ptr @continuation.free)
 ; AWAIT-NEXT:    [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null)
-; AWAIT-NEXT:    [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
-; AWAIT-NEXT:    [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]])
+; AWAIT-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
+; AWAIT-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; AWAIT-NEXT:    [[TMP5:%.*]] = call ptr [[TMP4]](i64 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; AWAIT-NEXT:    [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]])
 ; AWAIT-NEXT:    call void @lgc.cps.complete()
 ; AWAIT-NEXT:    unreachable
 ;
 ; CORO-LABEL: define { ptr, ptr } @simple_await_entry(
 ; CORO-SAME: ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation.entry [[META3:![0-9]+]] !continuation [[META4:![0-9]+]] {
 ; CORO-NEXT:  AllocaSpillBB:
-; CORO-NEXT:    [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
-; CORO-NEXT:    [[TMP1:%.*]] = insertvalue { ptr, ptr } poison, ptr @simple_await_entry.resume.0, 0
-; CORO-NEXT:    [[TMP2:%.*]] = insertvalue { ptr, ptr } [[TMP1]], ptr [[TOK]], 1
-; CORO-NEXT:    ret { ptr, ptr } [[TMP2]]
+; CORO-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
+; CORO-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; CORO-NEXT:    [[TMP2:%.*]] = call ptr [[TMP1]](i64 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; CORO-NEXT:    [[TMP3:%.*]] = insertvalue { ptr, ptr } poison, ptr @simple_await_entry.resume.0, 0
+; CORO-NEXT:    [[TMP4:%.*]] = insertvalue { ptr, ptr } [[TMP3]], ptr [[TMP2]], 1
+; CORO-NEXT:    ret { ptr, ptr } [[TMP4]]
 ;
 ; CLEANED-LABEL: define void @simple_await_entry(
 ; CLEANED-SAME: ) !continuation.registercount [[META1]] !continuation.entry [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !continuation.state [[META1]] {
 ; CLEANED-NEXT:  AllocaSpillBB:
-; CLEANED-NEXT:    [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await_entry.resume.0)
-; CLEANED-NEXT:    call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun to i64), i32 -1, {} poison, i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; CLEANED-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
+; CLEANED-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; CLEANED-NEXT:    [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await_entry.resume.0)
+; CLEANED-NEXT:    call void (...) @lgc.cps.jump(i64 [[CALLEE]], i32 -1, {} poison, i32 poison, i64 [[TMP1]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
 ; CLEANED-NEXT:    unreachable
 ;
-  %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1
-  call void @await.void(%continuation.token* %tok)
+  %callee = ptrtoint ptr @async_fun to i64
+  call void (...) @lgc.cps.await__void(i64 %callee, i32 2), !continuation.registercount !1, !continuation.returnedRegistercount !1
   ; Note: entry functions don't need a registercount annotation on return
   call void @lgc.cps.complete()
   unreachable
@@ -88,9 +98,11 @@ define void @await_with_arg(i64 %dummyRetAddr, i32 %i) !continuation.registercou
 ; AWAIT-SAME: i64 [[DUMMYRETADDR:%.*]], i32 [[I:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META5:![0-9]+]] {
 ; AWAIT-NEXT:    [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.await_with_arg, ptr @continuation.malloc, ptr @continuation.free)
 ; AWAIT-NEXT:    [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null)
-; AWAIT-NEXT:    [[TOK:%.*]] = call ptr @async_fun_with_arg(i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
-; AWAIT-NEXT:    [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]])
-; AWAIT-NEXT:    call void (...) @lgc.cps.jump(i64 [[DUMMYRETADDR]], i32 -1, {} poison, i64 poison), !continuation.registercount [[META1]]
+; AWAIT-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun_with_arg to i64
+; AWAIT-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; AWAIT-NEXT:    [[TMP5:%.*]] = call ptr [[TMP4]](i64 [[CALLEE]], i32 2, i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; AWAIT-NEXT:    [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]])
+; AWAIT-NEXT:    call void (...) @lgc.cps.jump(i64 [[DUMMYRETADDR]], i32 -1, {} poison, i32 poison, i64 poison), !continuation.registercount [[META1]]
 ; AWAIT-NEXT:    unreachable
 ;
 ; CORO-LABEL: define { ptr, ptr } @await_with_arg(
@@ -98,10 +110,12 @@ define void @await_with_arg(i64 %dummyRetAddr, i32 %i) !continuation.registercou
 ; CORO-NEXT:  AllocaSpillBB:
 ; CORO-NEXT:    [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_ARG_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0
 ; CORO-NEXT:    store i64 [[DUMMYRETADDR]], ptr [[DUMMYRETADDR_SPILL_ADDR]], align 4
-; CORO-NEXT:    [[TOK:%.*]] = call ptr @async_fun_with_arg(i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
-; CORO-NEXT:    [[TMP1:%.*]] = insertvalue { ptr, ptr } poison, ptr @await_with_arg.resume.0, 0
-; CORO-NEXT:    [[TMP2:%.*]] = insertvalue { ptr, ptr } [[TMP1]], ptr [[TOK]], 1
-; CORO-NEXT:    ret { ptr, ptr } [[TMP2]]
+; CORO-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun_with_arg to i64
+; CORO-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; CORO-NEXT:    [[TMP2:%.*]] = call ptr [[TMP1]](i64 [[CALLEE]], i32 2, i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; CORO-NEXT:    [[TMP3:%.*]] = insertvalue { ptr, ptr } poison, ptr @await_with_arg.resume.0, 0
+; CORO-NEXT:    [[TMP4:%.*]] = insertvalue { ptr, ptr } [[TMP3]], ptr [[TMP2]], 1
+; CORO-NEXT:    ret { ptr, ptr } [[TMP4]]
 ;
 ; CLEANED-LABEL: define void @await_with_arg(
 ; CLEANED-SAME: i64 [[DUMMYRETADDR:%.*]], i32 [[I:%.*]]) !continuation.registercount [[META1]] !continuation [[META6:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] {
@@ -109,13 +123,15 @@ define void @await_with_arg(i64 %dummyRetAddr, i32 %i) !continuation.registercou
 ; CLEANED-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8)
 ; CLEANED-NEXT:    [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_ARG_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
 ; CLEANED-NEXT:    store i64 [[DUMMYRETADDR]], ptr addrspace(32) [[DUMMYRETADDR_SPILL_ADDR]], align 4
-; CLEANED-NEXT:    [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @await_with_arg.resume.0)
-; CLEANED-NEXT:    call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun_with_arg to i64), i32 -1, {} poison, i64 [[TMP0]], i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; CLEANED-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun_with_arg to i64
+; CLEANED-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; CLEANED-NEXT:    [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @await_with_arg.resume.0)
+; CLEANED-NEXT:    call void (...) @lgc.cps.jump(i64 [[CALLEE]], i32 -1, {} poison, i32 poison, i64 [[TMP1]], i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
 ; CLEANED-NEXT:    unreachable
 ;
-  %tok = call %continuation.token* @async_fun_with_arg(i32 %i), !continuation.registercount !1,  !continuation.returnedRegistercount !1
-  call void @await.void(%continuation.token* %tok)
-  call void (...) @lgc.cps.jump(i64 %dummyRetAddr, i32 -1, {} poison, i64 poison), !continuation.registercount !1
+  %callee = ptrtoint ptr @async_fun_with_arg to i64
+  call void (...) @lgc.cps.await__void(i64 %callee, i32 2, i32 %i), !continuation.registercount !1,  !continuation.returnedRegistercount !1
+  call void (...) @lgc.cps.jump(i64 %dummyRetAddr, i32 -1, {} poison, i32 poison, i64 poison), !continuation.registercount !1
   unreachable
 }
 
@@ -124,35 +140,41 @@ define i32 @await_with_ret_value(i64 %dummyRetAddr) !continuation.registercount
 ; AWAIT-SAME: i64 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META6:![0-9]+]] {
 ; AWAIT-NEXT:    [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.await_with_ret_value, ptr @continuation.malloc, ptr @continuation.free)
 ; AWAIT-NEXT:    [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null)
-; AWAIT-NEXT:    [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
-; AWAIT-NEXT:    [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]])
-; AWAIT-NEXT:    [[TMP5:%.*]] = call i32 @lgc.ilcps.getReturnValue__i32()
-; AWAIT-NEXT:    call void (...) @lgc.cps.jump(i64 [[DUMMYRETADDR]], i32 -1, {} poison, i64 poison, i32 [[TMP5]]), !continuation.registercount [[META1]]
+; AWAIT-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
+; AWAIT-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; AWAIT-NEXT:    [[TMP5:%.*]] = call ptr [[TMP4]](i64 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; AWAIT-NEXT:    [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]])
+; AWAIT-NEXT:    [[TMP7:%.*]] = call i32 @lgc.ilcps.getReturnValue__i32()
+; AWAIT-NEXT:    call void (...) @lgc.cps.jump(i64 [[DUMMYRETADDR]], i32 -1, {} poison, i64 poison, i32 poison, i32 [[TMP7]]), !continuation.registercount [[META1]]
 ; AWAIT-NEXT:    unreachable
 ;
 ; CORO-LABEL: define { ptr, ptr } @await_with_ret_value(
 ; CORO-SAME: i64 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META6:![0-9]+]] {
 ; CORO-NEXT:  AllocaSpillBB:
-; CORO-NEXT:    [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_RET_VALUE_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0
-; CORO-NEXT:    store i64 [[DUMMYRETADDR]], ptr [[RETURNADDR_SPILL_ADDR]], align 4
-; CORO-NEXT:    [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
-; CORO-NEXT:    [[TMP1:%.*]] = insertvalue { ptr, ptr } poison, ptr @await_with_ret_value.resume.0, 0
-; CORO-NEXT:    [[TMP2:%.*]] = insertvalue { ptr, ptr } [[TMP1]], ptr [[TOK]], 1
-; CORO-NEXT:    ret { ptr, ptr } [[TMP2]]
+; CORO-NEXT:    [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_RET_VALUE_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0
+; CORO-NEXT:    store i64 [[DUMMYRETADDR]], ptr [[DUMMYRETADDR_SPILL_ADDR]], align 4
+; CORO-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
+; CORO-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; CORO-NEXT:    [[TMP2:%.*]] = call ptr [[TMP1]](i64 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; CORO-NEXT:    [[TMP3:%.*]] = insertvalue { ptr, ptr } poison, ptr @await_with_ret_value.resume.0, 0
+; CORO-NEXT:    [[TMP4:%.*]] = insertvalue { ptr, ptr } [[TMP3]], ptr [[TMP2]], 1
+; CORO-NEXT:    ret { ptr, ptr } [[TMP4]]
 ;
 ; CLEANED-LABEL: define void @await_with_ret_value(
 ; CLEANED-SAME: i64 [[DUMMYRETADDR:%.*]]) !continuation.registercount [[META1]] !continuation [[META7:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] {
 ; CLEANED-NEXT:  AllocaSpillBB:
 ; CLEANED-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8)
-; CLEANED-NEXT:    [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_RET_VALUE_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
-; CLEANED-NEXT:    store i64 [[DUMMYRETADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4
-; CLEANED-NEXT:    [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @await_with_ret_value.resume.0)
-; CLEANED-NEXT:    call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun to i64), i32 -1, {} poison, i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; CLEANED-NEXT:    [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_RET_VALUE_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
+; CLEANED-NEXT:    store i64 [[DUMMYRETADDR]], ptr addrspace(32) [[DUMMYRETADDR_SPILL_ADDR]], align 4
+; CLEANED-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun to i64
+; CLEANED-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; CLEANED-NEXT:    [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @await_with_ret_value.resume.0)
+; CLEANED-NEXT:    call void (...) @lgc.cps.jump(i64 [[CALLEE]], i32 -1, {} poison, i32 poison, i64 [[TMP1]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
 ; CLEANED-NEXT:    unreachable
 ;
-  %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1
-  %res = call i32 @await.i32(%continuation.token* %tok)
-  call void (...) @lgc.cps.jump(i64 %dummyRetAddr, i32 -1, {} poison, i64 poison, i32 %res), !continuation.registercount !1
+  %callee = ptrtoint ptr @async_fun to i64
+  %res = call i32 (...) @lgc.cps.await__i32(i64 %callee, i32 2), !continuation.registercount !1, !continuation.returnedRegistercount !1
+  call void (...) @lgc.cps.jump(i64 %dummyRetAddr, i32 -1, {} poison, i64 poison, i32 poison, i32 %res), !continuation.registercount !1
   unreachable
 }
 
@@ -161,20 +183,24 @@ define void @wait_await(i64 %dummyRetAddr) !continuation.registercount !1 {
 ; AWAIT-SAME: i64 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META7:![0-9]+]] {
 ; AWAIT-NEXT:    [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.wait_await, ptr @continuation.malloc, ptr @continuation.free)
 ; AWAIT-NEXT:    [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null)
-; AWAIT-NEXT:    [[TOK:%.*]] = call ptr @async_fun_with_waitmask(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !waitmask [[META8:![0-9]+]]
-; AWAIT-NEXT:    [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]])
-; AWAIT-NEXT:    call void (...) @lgc.cps.jump(i64 [[DUMMYRETADDR]], i32 -1, i64 poison, i64 poison), !continuation.registercount [[META1]]
+; AWAIT-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun_with_waitmask to i64
+; AWAIT-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; AWAIT-NEXT:    [[TMP5:%.*]] = call ptr [[TMP4]](i64 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !waitmask [[META8:![0-9]+]]
+; AWAIT-NEXT:    [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]])
+; AWAIT-NEXT:    call void (...) @lgc.cps.jump(i64 [[DUMMYRETADDR]], i32 -1, i64 poison, i32 poison, i64 poison), !continuation.registercount [[META1]]
 ; AWAIT-NEXT:    unreachable
 ;
 ; CORO-LABEL: define { ptr, ptr } @wait_await(
 ; CORO-SAME: i64 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META7:![0-9]+]] {
 ; CORO-NEXT:  AllocaSpillBB:
-; CORO-NEXT:    [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[WAIT_AWAIT_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0
-; CORO-NEXT:    store i64 [[DUMMYRETADDR]], ptr [[RETURNADDR_SPILL_ADDR]], align 4
-; CORO-NEXT:    [[TOK:%.*]] = call ptr @async_fun_with_waitmask(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !waitmask [[META8:![0-9]+]]
-; CORO-NEXT:    [[TMP1:%.*]] = insertvalue { ptr, ptr } poison, ptr @wait_await.resume.0, 0
-; CORO-NEXT:    [[TMP2:%.*]] = insertvalue { ptr, ptr } [[TMP1]], ptr [[TOK]], 1
-; CORO-NEXT:    ret { ptr, ptr } [[TMP2]]
+; CORO-NEXT:    [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[WAIT_AWAIT_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0
+; CORO-NEXT:    store i64 [[DUMMYRETADDR]], ptr [[DUMMYRETADDR_SPILL_ADDR]], align 4
+; CORO-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun_with_waitmask to i64
+; CORO-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; CORO-NEXT:    [[TMP2:%.*]] = call ptr [[TMP1]](i64 [[CALLEE]], i32 2), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !waitmask [[META8:![0-9]+]]
+; CORO-NEXT:    [[TMP3:%.*]] = insertvalue { ptr, ptr } poison, ptr @wait_await.resume.0, 0
+; CORO-NEXT:    [[TMP4:%.*]] = insertvalue { ptr, ptr } [[TMP3]], ptr [[TMP2]], 1
+; CORO-NEXT:    ret { ptr, ptr } [[TMP4]]
 ;
 ; CLEANED-LABEL: define void @wait_await(
 ; CLEANED-SAME: i64 [[DUMMYRETADDR:%.*]]) !continuation.registercount [[META1]] !continuation [[META8:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] {
@@ -182,13 +208,15 @@ define void @wait_await(i64 %dummyRetAddr) !continuation.registercount !1 {
 ; CLEANED-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8)
 ; CLEANED-NEXT:    [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[WAIT_AWAIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
 ; CLEANED-NEXT:    store i64 [[DUMMYRETADDR]], ptr addrspace(32) [[DUMMYRETADDR_SPILL_ADDR]], align 4
-; CLEANED-NEXT:    [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @wait_await.resume.0)
-; CLEANED-NEXT:    call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun_with_waitmask to i64), i32 -1, {} poison, i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !waitmask [[META9:![0-9]+]]
+; CLEANED-NEXT:    [[CALLEE:%.*]] = ptrtoint ptr @async_fun_with_waitmask to i64
+; CLEANED-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[CALLEE]] to ptr
+; CLEANED-NEXT:    [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @wait_await.resume.0)
+; CLEANED-NEXT:    call void (...) @lgc.cps.jump(i64 [[CALLEE]], i32 -1, {} poison, i32 poison, i64 [[TMP1]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !waitmask [[META9:![0-9]+]]
 ; CLEANED-NEXT:    unreachable
 ;
-  %tok = call %continuation.token* @async_fun_with_waitmask(), !waitmask !3, !continuation.registercount !1, !continuation.returnedRegistercount !1
-  call void @await.void(%continuation.token* %tok)
-  call void (...) @lgc.cps.jump(i64 %dummyRetAddr, i32 -1, i64 poison, i64 poison), !continuation.registercount !1
+  %callee = ptrtoint ptr @async_fun_with_waitmask to i64
+  call void (...) @lgc.cps.await__void(i64 %callee, i32 2), !waitmask !3, !continuation.registercount !1, !continuation.returnedRegistercount !1
+  call void (...) @lgc.cps.jump(i64 %dummyRetAddr, i32 -1, i64 poison, i32 poison, i64 poison), !continuation.registercount !1
   unreachable
 }
 
diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll b/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll
index 0c341d0fe3..34a5091c23 100644
--- a/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll
+++ b/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll
@@ -19,6 +19,8 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:
 
 declare i32 @_cont_GetContinuationStackAddr()
 
+declare !pointeetys !13 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) !pointeetys !13 {
   ret i32 5
 }
@@ -102,15 +104,15 @@ attributes #0 = { nounwind }
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP3]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP4:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP8:%.*]] = call ptr inttoptr (i64 2 to ptr)(i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [19 x i32] poison, [1 x i32] [[TMP4]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP9:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } @await(ptr [[TMP8]])
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP10:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP9]], 2
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP8:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa19i32a1i32s(i64 2, i32 4, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [19 x i32] poison, [1 x i32] [[TMP4]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP10:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP8]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [1 x i32] [[TMP10]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_THEIRPARAMS]] poison, ptr [[PARAMS]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP11:%.*]] = freeze [[STRUCT_THEIRPARAMS]] poison
+; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_THEIRPARAMS]] [[TMP11]], ptr [[PARAMS]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP7:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP7]], ptr [[TMP6]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP9]], 0
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP8]], 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP5]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
 ; LOWERRAYTRACINGPIPELINE-NEXT:    br label [[DOTSPLIT:%.*]]
@@ -139,7 +141,8 @@ attributes #0 = { nounwind }
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP5:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa19i32a1i32s(i32 2, i32 4, i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [19 x i32] poison, [1 x i32] [[TMP4]]), !continuation.returnedRegistercount [[META14]], !continuation.registercount [[META14]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP5]], 2
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [1 x i32] [[TMP6]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_THEIRPARAMS]] poison, ptr [[PARAMS]], align 4
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP10:%.*]] = freeze [[STRUCT_THEIRPARAMS]] poison
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_THEIRPARAMS]] [[TMP10]], ptr [[PARAMS]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP9:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP9]], ptr [[TMP8]], align 4
@@ -179,6 +182,8 @@ attributes #0 = { nounwind }
 ; POSTPROCESS-CPS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
 ; POSTPROCESS-CPS-NEXT:    [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP3]], 2
 ; POSTPROCESS-CPS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP5]], 0
+; POSTPROCESS-CPS-NEXT:    [[TMP7:%.*]] = freeze [[STRUCT_THEIRPARAMS:%.*]] poison
+; POSTPROCESS-CPS-NEXT:    [[DOTFCA_0_EXTRACT6:%.*]] = extractvalue [[STRUCT_THEIRPARAMS]] [[TMP7]], 0
 ; POSTPROCESS-CPS-NEXT:    [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP3]], 0
 ; POSTPROCESS-CPS-NEXT:    [[DOTFCA_0_EXTRACT4:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP6]], 0
 ; POSTPROCESS-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll b/llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll
index f441c64dd2..3eb4bb830b 100644
--- a/llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll
+++ b/llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll
@@ -17,6 +17,8 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:
 
 declare void @lgc.ilcps.waitContinue(...) noreturn
 
+declare !pointeetys !24 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 declare !pointeetys !24 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data)
 
 declare !pointeetys !27 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind)
diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll
index 2c4aa40ae2..27f47882ba 100644
--- a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll
+++ b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s
-; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s
+; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s
 
 target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
 
@@ -197,8 +197,6 @@ define void @RayGen() #3 {
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
 ; DXILCONTPOSTPROCESS-NEXT:    ret void
-; DXILCONTPOSTPROCESS:       AllocaSpillBB.split:
-; DXILCONTPOSTPROCESS-NEXT:    unreachable
 ;
   ret void
 }
@@ -234,8 +232,7 @@ define void @Intersection() #3 {
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TRAV_DATA_I:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP12:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP11]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP13:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP20:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP12]], {} poison, [30 x i32] [[TMP13]]), !continuation.registercount [[META26]], !continuation.returnedRegistercount [[META26]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP22:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } @await(ptr [[TMP20]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP22:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa2i32a30i32s(i64 3, i32 16, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP12]], {} poison, [30 x i32] [[TMP13]]), !continuation.registercount [[META26]], !continuation.returnedRegistercount [[META26]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP23:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP22]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [30 x i32] [[TMP23]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP14:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP22]], 0
@@ -243,25 +240,25 @@ define void @Intersection() #3 {
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[ISEND_I:%.*]] = call i1 @opaqueIsEnd()
 ; LOWERRAYTRACINGPIPELINE-NEXT:    br i1 [[ISEND_I]], label [[TMP16:%.*]], label [[TMP18:%.*]]
-; LOWERRAYTRACINGPIPELINE:       19:
+; LOWERRAYTRACINGPIPELINE:       18:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP17:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP21:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], [2 x i32] poison, [30 x i32] [[TMP21]]), !continuation.registercount [[META26]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], [2 x i32] poison, [30 x i32] [[TMP21]]), !continuation.registercount [[META26]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
-; LOWERRAYTRACINGPIPELINE:       22:
+; LOWERRAYTRACINGPIPELINE:       21:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP19:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP24:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP19]], [2 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META26]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP19]], [2 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META26]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define void @Intersection(
 ; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META30:![0-9]+]] !continuation [[META31:![0-9]+]] !continuation.stacksize [[META25:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
-; DXILCONTPOSTPROCESS-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP13:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8
 ; DXILCONTPOSTPROCESS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
-; DXILCONTPOSTPROCESS-NEXT:    store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
+; DXILCONTPOSTPROCESS-NEXT:    store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], ptr [[TMP13]], align 4
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP2:%.*]] = load i32, ptr [[CSP]], align 4
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 8
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[TMP3]], ptr [[CSP]], align 4
@@ -308,14 +305,14 @@ define void @Intersection() #3 {
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 1, 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 1, 1
 ; DXILCONTPOSTPROCESS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[TMP13]], i32 0, i32 0, i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP7:%.*]] = call float @_cont_RayTMin(ptr [[TMP6]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[RES_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_HITDATA]] poison, float [[DOTFCA_0_1_0_EXTRACT]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[RES_I_FCA_1_INSERT:%.*]] = insertvalue [[STRUCT_HITDATA]] [[RES_I_FCA_0_INSERT]], i32 [[DOTFCA_0_1_1_EXTRACT]], 1
 ; DXILCONTPOSTPROCESS-NEXT:    [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_HITDATA]] [[RES_I_FCA_1_INSERT]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[RES_I_FCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [[STRUCT_HITDATA]] [[RES_I_FCA_1_INSERT]], 1
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[TMP13]], i32 0, i32 0, i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[TMP13]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[RESPTR_I:%.*]] = getelementptr [[STRUCT_TRAVERSALDATA:%.*]], ptr [[TMP9]], i32 0, i32 1
 ; DXILCONTPOSTPROCESS-NEXT:    [[RES_I:%.*]] = load [[STRUCT_HITDATA]], ptr [[RESPTR_I]], align 4
 ; DXILCONTPOSTPROCESS-NEXT:    store [[STRUCT_HITDATA]] [[RES_I]], ptr [[TMP1]], align 4
@@ -382,10 +379,10 @@ define void @Intersection() #3 {
 define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #3 !pointeetys !47 {
 ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @AnyHit(
 ; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META33:![0-9]+]] !continuation [[META34:![0-9]+]] !continuation.registercount [[META27:![0-9]+]] {
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP6:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP3:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP5:%.*]] = alloca [[STRUCT_HITDATA]], align 8
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP6:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP7:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP15:%.*]] = alloca [4 x i32], align 4
@@ -474,7 +471,7 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil
 ; LOWERRAYTRACINGPIPELINE-NEXT:    call void @_cont_SetTriangleHitAttributes(ptr [[TMP57]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP56]])
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP58:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP72:%.*]] = load [4 x i32], ptr [[TMP15]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP58]], [2 x i32] poison, [4 x i32] [[TMP72]]), !continuation.registercount [[META27]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP58]], [2 x i32] poison, [4 x i32] [[TMP72]]), !continuation.registercount [[META27]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define void @AnyHit(
@@ -709,7 +706,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP52:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP51]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP45:%.*]] = load [4 x i32], ptr [[TMP13]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP52]], [14 x i32] poison, [4 x i32] [[TMP45]]), !continuation.registercount [[META27]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP52]], [14 x i32] poison, [4 x i32] [[TMP45]]), !continuation.registercount [[META27]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define void @ClosestHit(
diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics.ll b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics.ll
index d4288c5e5f..2a900dec45 100644
--- a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics.ll
+++ b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function ClosestHit --version 3
 ; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s
-; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s
+; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s
 
 target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
 
@@ -216,7 +216,7 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP98:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP97]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP81:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP98]], [17 x i32] poison, [10 x i32] [[TMP81]]), !continuation.registercount [[META20]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP98]], [17 x i32] poison, [10 x i32] [[TMP81]]), !continuation.registercount [[META20]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define void @ClosestHit(
@@ -231,11 +231,11 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP7:%.*]] = alloca [[STRUCT_HITDATA]], align 8
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP8:%.*]] = alloca [[STRUCT_HITDATA]], align 8
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP9:%.*]] = alloca [[STRUCT_HITDATA]], align 8
-; DXILCONTPOSTPROCESS-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8
 ; DXILCONTPOSTPROCESS-NEXT:    [[SYSTEM_DATA_ALLOCA1:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP52:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8
 ; DXILCONTPOSTPROCESS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
-; DXILCONTPOSTPROCESS-NEXT:    store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA1]], align 4
+; DXILCONTPOSTPROCESS-NEXT:    store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[TMP52]], align 4
 ; DXILCONTPOSTPROCESS-NEXT:    [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1
 ; DXILCONTPOSTPROCESS-NEXT:    [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 2
@@ -247,32 +247,32 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect
 ; DXILCONTPOSTPROCESS-NEXT:    [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 8
 ; DXILCONTPOSTPROCESS-NEXT:    [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 9
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[TMP0]], 0, 0
-; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    store <3 x i32> [[DOTFCA_0_0_EXTRACT]], ptr [[DOTFCA_0_0_GEP]], align 4
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP11:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]])
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP11:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA1]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT19:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP11]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTSROA_021_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT19]], i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP12:%.*]] = bitcast float [[DOTSROA_021_0_VEC_EXTRACT]] to i32
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTSROA_021_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT19]], i32 1
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP13:%.*]] = bitcast float [[DOTSROA_021_4_VEC_EXTRACT]] to i32
 ; DXILCONTPOSTPROCESS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[TMP52]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP15:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP14]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[A:%.*]] = extractelement <3 x i32> [[TMP15]], i8 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[TMP52]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP17:%.*]] = call <3 x i32> @_cont_DispatchRaysDimensions3(ptr [[TMP16]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[B:%.*]] = extractelement <3 x i32> [[TMP17]], i8 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP19:%.*]] = call <3 x float> @_cont_WorldRayOrigin3(ptr [[TMP18]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[C:%.*]] = extractelement <3 x float> [[TMP19]], i8 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP21:%.*]] = call <3 x float> @_cont_WorldRayDirection3(ptr [[TMP20]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[D:%.*]] = extractelement <3 x float> [[TMP21]], i8 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP23:%.*]] = call float @_cont_RayTMin(ptr [[TMP22]])
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP25:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]])
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP25:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA1]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT29:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP25]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_GEP30:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP7]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    store float [[DOTFCA_0_EXTRACT29]], ptr [[DOTFCA_0_GEP30]], align 4
@@ -280,10 +280,10 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_1_GEP32:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP7]], i32 0, i32 1
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[DOTFCA_1_EXTRACT31]], ptr [[DOTFCA_1_GEP32]], align 4
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP26:%.*]] = call float @_cont_RayTCurrent(ptr [[TMP24]], ptr [[TMP7]])
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP28:%.*]] = call i32 @_cont_RayFlags(ptr [[TMP27]])
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP30:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]])
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP30:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA1]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT45:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP30]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_GEP46:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP3]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    store float [[DOTFCA_0_EXTRACT45]], ptr [[DOTFCA_0_GEP46]], align 4
@@ -291,8 +291,8 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_1_GEP48:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP3]], i32 0, i32 1
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[DOTFCA_1_EXTRACT47]], ptr [[DOTFCA_1_GEP48]], align 4
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP31:%.*]] = call i32 @_cont_InstanceIndex(ptr [[TMP29]], ptr [[TMP3]])
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP33:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]])
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP33:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA1]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT41:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP33]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_GEP42:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP4]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    store float [[DOTFCA_0_EXTRACT41]], ptr [[DOTFCA_0_GEP42]], align 4
@@ -300,8 +300,8 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_1_GEP44:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP4]], i32 0, i32 1
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[DOTFCA_1_EXTRACT43]], ptr [[DOTFCA_1_GEP44]], align 4
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP34:%.*]] = call i32 @_cont_InstanceID(ptr [[TMP32]], ptr [[TMP4]])
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP36:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]])
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP36:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA1]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT37:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP36]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_GEP38:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP5]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    store float [[DOTFCA_0_EXTRACT37]], ptr [[DOTFCA_0_GEP38]], align 4
@@ -309,8 +309,8 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_1_GEP40:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP5]], i32 0, i32 1
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[DOTFCA_1_EXTRACT39]], ptr [[DOTFCA_1_GEP40]], align 4
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP37:%.*]] = call i32 @_cont_PrimitiveIndex(ptr [[TMP35]], ptr [[TMP5]])
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP39:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]])
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP39:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA1]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT25:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP39]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_GEP26:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP8]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    store float [[DOTFCA_0_EXTRACT25]], ptr [[DOTFCA_0_GEP26]], align 4
@@ -319,8 +319,8 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[DOTFCA_1_EXTRACT27]], ptr [[DOTFCA_1_GEP28]], align 4
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP40:%.*]] = call <3 x float> @_cont_ObjectRayOrigin3(ptr [[TMP38]], ptr [[TMP8]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[K:%.*]] = extractelement <3 x float> [[TMP40]], i8 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP42:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]])
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP42:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA1]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT22:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP42]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_GEP23:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP9]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    store float [[DOTFCA_0_EXTRACT22]], ptr [[DOTFCA_0_GEP23]], align 4
@@ -329,8 +329,8 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[DOTFCA_1_EXTRACT24]], ptr [[DOTFCA_1_GEP]], align 4
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP43:%.*]] = call <3 x float> @_cont_ObjectRayDirection3(ptr [[TMP41]], ptr [[TMP9]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[L:%.*]] = extractelement <3 x float> [[TMP43]], i8 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP45:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]])
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP45:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA1]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT53:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP45]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_GEP54:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP1]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    store float [[DOTFCA_0_EXTRACT53]], ptr [[DOTFCA_0_GEP54]], align 4
@@ -343,8 +343,8 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [4 x <3 x float>] [[TMP46]], 2
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [4 x <3 x float>] [[TMP46]], 3
 ; DXILCONTPOSTPROCESS-NEXT:    [[M:%.*]] = extractelement <3 x float> [[DOTFCA_0_EXTRACT]], i32 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP48:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]])
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP48:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA1]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT49:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP48]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_GEP50:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP2]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    store float [[DOTFCA_0_EXTRACT49]], ptr [[DOTFCA_0_GEP50]], align 4
@@ -357,16 +357,16 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_2_EXTRACT13:%.*]] = extractvalue [4 x <3 x float>] [[TMP49]], 2
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_3_EXTRACT14:%.*]] = extractvalue [4 x <3 x float>] [[TMP49]], 3
 ; DXILCONTPOSTPROCESS-NEXT:    [[N:%.*]] = extractelement <3 x float> [[DOTFCA_0_EXTRACT10]], i32 0
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP50:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]])
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP50:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA1]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT33:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP50]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_GEP34:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP6]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    store float [[DOTFCA_0_EXTRACT33]], ptr [[DOTFCA_0_GEP34]], align 4
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_1_EXTRACT35:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP50]], 1
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_1_GEP36:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP6]], i32 0, i32 1
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[DOTFCA_1_EXTRACT35]], ptr [[DOTFCA_1_GEP36]], align 4
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP51:%.*]] = call i32 @_cont_HitKind(ptr [[SYSTEM_DATA_ALLOCA]], ptr [[TMP6]])
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
-; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP52]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP51:%.*]] = call i32 @_cont_HitKind(ptr [[SYSTEM_DATA_ALLOCA1]], ptr [[TMP6]])
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA1]], i32 0, i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP53]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_GEP]], align 4
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_LOAD]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_INSERT1:%.*]] = insertvalue [10 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0
diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll b/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll
index 1d5f3b5b9e..2ce27b76fe 100644
--- a/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll
+++ b/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll
@@ -4,9 +4,9 @@
 ; This results in a bit nicer result IR, containing less "spam" copying payload fields around.
 ; We also set a max hit attribute size ensuring there is no need for hit attribute storage in the payload.
 ; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s
-; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s
+; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s
 ; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANUP-CPS %s
-; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s
+; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s
 
 target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
 
@@ -97,6 +97,8 @@ declare !pointeetys !26 void @_AmdRestoreSystemData(%struct.DispatchSystemData*)
 ; Function Attrs: nounwind memory(none)
 declare !pointeetys !28 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #3
 
+declare !pointeetys !30 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 ; Function Attrs: alwaysinline
 define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #1 !pointeetys !30 {
   ret i32 5
@@ -192,14 +194,14 @@ attributes #3 = { nounwind memory(none) }
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP9]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP10:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP19:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[TMP10]]), !continuation.registercount [[META17]], !waitmask [[META20:![0-9]+]], !continuation.returnedRegistercount [[META17]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP25:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } @await.2(ptr [[TMP19]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP25:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa27i32a1i32s(i64 4, i32 8, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[TMP10]]), !continuation.registercount [[META17]], !waitmask [[META6:![0-9]+]], !continuation.returnedRegistercount [[META17]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } [[TMP25]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [1 x i32] [[TMP13]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_SMALLPAYLOAD]] poison, ptr [[P1]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP15:%.*]] = freeze [[STRUCT_SMALLPAYLOAD]] poison
+; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_SMALLPAYLOAD]] [[TMP15]], ptr [[P1]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_SMALLPAYLOAD]], ptr [[P1]], i32 0
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP15:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP15]], ptr [[TMP12]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP19:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP19]], ptr [[TMP12]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP11:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } [[TMP25]], 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP11]], ptr [[TMP7]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
@@ -217,8 +219,8 @@ attributes #3 = { nounwind memory(none) }
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP17:%.*]] = load ptr addrspace(32), ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_MEDIUMPAYLOAD]], ptr [[P2]], i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP34:%.*]] = load i32, ptr [[TMP18]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP34]], ptr [[TMP37]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP41:%.*]] = load i32, ptr [[TMP18]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP41]], ptr [[TMP37]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 1
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP21]], ptr addrspace(32) [[TMP17]], align 4
@@ -227,11 +229,11 @@ attributes #3 = { nounwind memory(none) }
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP24]], ptr addrspace(32) [[TMP22]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP29:%.*]] = load [2 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP41:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]], [10 x i32] poison, [2 x i32] [[TMP29]]), !continuation.registercount [[META13:![0-9]+]], !waitmask [[META20]], !continuation.returnedRegistercount [[META13]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP44:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } @await.1(ptr [[TMP41]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP44:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa27i32a2i32s(i64 4, i32 8, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]], [10 x i32] poison, [2 x i32] [[TMP29]]), !continuation.registercount [[META13:![0-9]+]], !waitmask [[META6]], !continuation.returnedRegistercount [[META13]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP60:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } [[TMP44]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [2 x i32] [[TMP60]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_MEDIUMPAYLOAD]] poison, ptr [[P2]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP34:%.*]] = freeze [[STRUCT_MEDIUMPAYLOAD]] poison
+; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_MEDIUMPAYLOAD]] [[TMP34]], ptr [[P2]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP27:%.*]] = load ptr addrspace(32), ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_MEDIUMPAYLOAD]], ptr [[P2]], i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1
@@ -280,11 +282,11 @@ attributes #3 = { nounwind memory(none) }
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP52]], ptr addrspace(32) [[TMP50]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP62:%.*]] = load [2 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP63:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [10 x i32] poison, [2 x i32] [[TMP62]]), !continuation.registercount [[META13]], !waitmask [[META20]], !continuation.returnedRegistercount [[META13]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP64:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } @await(ptr [[TMP63]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP64:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa27i32a2i32s(i64 4, i32 8, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [10 x i32] poison, [2 x i32] [[TMP62]]), !continuation.registercount [[META13]], !waitmask [[META6]], !continuation.returnedRegistercount [[META13]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP65:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } [[TMP64]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [2 x i32] [[TMP65]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_LARGEPAYLOAD]] poison, ptr [[P3]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP71:%.*]] = freeze [[STRUCT_LARGEPAYLOAD]] poison
+; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_LARGEPAYLOAD]] [[TMP71]], ptr [[P3]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP76:%.*]] = load ptr addrspace(32), ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_LARGEPAYLOAD]], ptr [[P3]], i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1
@@ -317,7 +319,7 @@ attributes #3 = { nounwind memory(none) }
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP101:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP100]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP95:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP101]], [27 x i32] poison, [1 x i32] [[TMP95]]), !continuation.registercount [[META17]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP101]], [27 x i32] poison, [1 x i32] [[TMP95]]), !continuation.registercount [[META17]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ;
@@ -345,11 +347,10 @@ attributes #3 = { nounwind memory(none) }
 ; CLEANUP-NEXT:    [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0
 ; CLEANUP-NEXT:    [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0
 ; CLEANUP-NEXT:    [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0
-; CLEANUP-NEXT:    [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR0:[0-9]+]]
+; CLEANUP-NEXT:    [[ADDR_I:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.0)
 ; CLEANUP-NEXT:    [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I]], 5
 ; CLEANUP-NEXT:    [[DOTFCA_0_INSERT15:%.*]] = insertvalue [1 x i32] poison, i32 0, 0
-; CLEANUP-NEXT:    [[TMP3:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.0)
-; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP3]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT15]]), !continuation.registercount [[META17]], !waitmask [[META21:![0-9]+]], !continuation.returnedRegistercount [[META17]]
+; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT15]]), !continuation.registercount [[META17]], !waitmask [[META6:![0-9]+]], !continuation.returnedRegistercount [[META17]]
 ; CLEANUP-NEXT:    unreachable
 ;
 ;
@@ -360,6 +361,8 @@ attributes #3 = { nounwind memory(none) }
 ; CLEANUP-NEXT:    [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
 ; CLEANUP-NEXT:    [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } [[TMP1]], 2
 ; CLEANUP-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP2]], 0
+; CLEANUP-NEXT:    [[TMP9:%.*]] = freeze [[STRUCT_SMALLPAYLOAD:%.*]] poison
+; CLEANUP-NEXT:    [[DOTFCA_0_0_EXTRACT66:%.*]] = extractvalue [[STRUCT_SMALLPAYLOAD]] [[TMP9]], 0, 0
 ; CLEANUP-NEXT:    [[TMP7:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } [[TMP1]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_0_EXTRACT42:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP7]], 0
 ; CLEANUP-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
@@ -370,7 +373,7 @@ attributes #3 = { nounwind memory(none) }
 ; CLEANUP-NEXT:    [[DIS_DATA_I1_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT42]], 0
 ; CLEANUP-NEXT:    [[SYS_DATA_I2:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I1_FCA_0_INSERT]], 0
 ; CLEANUP-NEXT:    [[TRAV_DATA_I3:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I2]], 0
-; CLEANUP-NEXT:    [[ADDR_I4:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR0]]
+; CLEANUP-NEXT:    [[ADDR_I4:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.1)
 ; CLEANUP-NEXT:    [[TRAV_DATA2_I5:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I3]], i64 [[ADDR_I4]], 5
 ; CLEANUP-NEXT:    [[TMP5:%.*]] = ptrtoint ptr addrspace(32) [[PAYLOAD_SPILL_ALLOCA]] to i32
 ; CLEANUP-NEXT:    [[TMP4:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(32)
@@ -379,8 +382,7 @@ attributes #3 = { nounwind memory(none) }
 ; CLEANUP-NEXT:    store i32 0, ptr addrspace(32) [[TMP6]], align 4
 ; CLEANUP-NEXT:    [[DOTFCA_0_INSERT19:%.*]] = insertvalue [2 x i32] poison, i32 [[TMP5]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT19]], i32 0, 1
-; CLEANUP-NEXT:    [[TMP8:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.1)
-; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP8]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]], [10 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META13:![0-9]+]], !waitmask [[META21]], !continuation.returnedRegistercount [[META13]]
+; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]], [10 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META13:![0-9]+]], !waitmask [[META6]], !continuation.returnedRegistercount [[META13]]
 ; CLEANUP-NEXT:    unreachable
 ;
 ;
@@ -392,6 +394,10 @@ attributes #3 = { nounwind memory(none) }
 ; CLEANUP-NEXT:    [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } [[TMP1]], 2
 ; CLEANUP-NEXT:    [[DOTFCA_0_EXTRACT20:%.*]] = extractvalue [2 x i32] [[TMP2]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x i32] [[TMP2]], 1
+; CLEANUP-NEXT:    [[TMP3:%.*]] = freeze [[STRUCT_MEDIUMPAYLOAD:%.*]] poison
+; CLEANUP-NEXT:    [[DOTFCA_0_0_EXTRACT60:%.*]] = extractvalue [[STRUCT_MEDIUMPAYLOAD]] [[TMP3]], 0, 0
+; CLEANUP-NEXT:    [[DOTFCA_0_1_EXTRACT62:%.*]] = extractvalue [[STRUCT_MEDIUMPAYLOAD]] [[TMP3]], 0, 1
+; CLEANUP-NEXT:    [[DOTFCA_0_2_EXTRACT64:%.*]] = extractvalue [[STRUCT_MEDIUMPAYLOAD]] [[TMP3]], 0, 2
 ; CLEANUP-NEXT:    [[TMP6:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT20]] to ptr addrspace(32)
 ; CLEANUP-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(32) [[TMP6]], align 4
 ; CLEANUP-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 1
@@ -407,7 +413,7 @@ attributes #3 = { nounwind memory(none) }
 ; CLEANUP-NEXT:    [[DIS_DATA_I5_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT12]], 0
 ; CLEANUP-NEXT:    [[SYS_DATA_I6:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I5_FCA_0_INSERT]], 0
 ; CLEANUP-NEXT:    [[TRAV_DATA_I7:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I6]], 0
-; CLEANUP-NEXT:    [[ADDR_I9:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR0]]
+; CLEANUP-NEXT:    [[ADDR_I9:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.2)
 ; CLEANUP-NEXT:    [[TRAV_DATA2_I10:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I7]], i64 [[ADDR_I9]], 5
 ; CLEANUP-NEXT:    [[TMP14:%.*]] = ptrtoint ptr addrspace(32) [[PAYLOAD_SPILL_ALLOCA]] to i32
 ; CLEANUP-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP14]] to ptr addrspace(32)
@@ -420,8 +426,7 @@ attributes #3 = { nounwind memory(none) }
 ; CLEANUP-NEXT:    store i32 0, ptr addrspace(32) [[TMP12]], align 4
 ; CLEANUP-NEXT:    [[DOTFCA_0_INSERT25:%.*]] = insertvalue [2 x i32] poison, i32 [[TMP14]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_1_INSERT28:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT25]], i32 0, 1
-; CLEANUP-NEXT:    [[TMP17:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.2)
-; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP17]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [10 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT28]]), !continuation.registercount [[META13]], !waitmask [[META21]], !continuation.returnedRegistercount [[META13]]
+; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [10 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT28]]), !continuation.registercount [[META13]], !waitmask [[META6]], !continuation.returnedRegistercount [[META13]]
 ; CLEANUP-NEXT:    unreachable
 ;
 ;
@@ -433,6 +438,12 @@ attributes #3 = { nounwind memory(none) }
 ; CLEANUP-NEXT:    [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } [[TMP1]], 2
 ; CLEANUP-NEXT:    [[DOTFCA_0_EXTRACT29:%.*]] = extractvalue [2 x i32] [[TMP2]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_1_EXTRACT31:%.*]] = extractvalue [2 x i32] [[TMP2]], 1
+; CLEANUP-NEXT:    [[TMP3:%.*]] = freeze [[STRUCT_LARGEPAYLOAD:%.*]] poison
+; CLEANUP-NEXT:    [[DOTFCA_0_0_EXTRACT54:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP3]], 0, 0
+; CLEANUP-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP3]], 0, 1
+; CLEANUP-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP3]], 0, 2
+; CLEANUP-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP3]], 0, 3
+; CLEANUP-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP3]], 0, 4
 ; CLEANUP-NEXT:    [[TMP4:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT29]] to ptr addrspace(32)
 ; CLEANUP-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(32) [[TMP4]], align 4
 ; CLEANUP-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP4]], i32 1
@@ -452,7 +463,7 @@ attributes #3 = { nounwind memory(none) }
 ; CLEANUP-NEXT:    [[DOTFCA_0_INSERT41:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT46]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 0
 ; CLEANUP-NEXT:    call void @lgc.cps.free(i32 28)
-; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT41]], [27 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]]
+; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT41]], [27 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]]
 ; CLEANUP-NEXT:    unreachable
 ;
 ;
@@ -462,7 +473,7 @@ attributes #3 = { nounwind memory(none) }
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @Miss(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !lgc.cps [[META13:![0-9]+]] !continuation [[META17:![0-9]+]] !continuation.stacksize [[META18:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !lgc.cps [[META13:![0-9]+]] !continuation [[META17:![0-9]+]] !continuation.stacksize [[META18:![0-9]+]] !continuation.state [[META14:![0-9]+]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 24)
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
@@ -476,24 +487,26 @@ attributes #3 = { nounwind memory(none) }
 ; CLEANUP-CPS-NEXT:    [[T1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4
 ; CLEANUP-CPS-NEXT:    [[T2:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T1]])
 ; CLEANUP-CPS-NEXT:    [[T3:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T2]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 })
-; CLEANUP-CPS-NEXT:    [[TMP0:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]])
+; CLEANUP-CPS-NEXT:    [[TMP2:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]])
 ; CLEANUP-CPS-NEXT:    [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_EXTRACT]], 0
 ; CLEANUP-CPS-NEXT:    [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0
 ; CLEANUP-CPS-NEXT:    [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0
 ; CLEANUP-CPS-NEXT:    [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.0)
 ; CLEANUP-CPS-NEXT:    [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP1]], 5
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_INSERT13:%.*]] = insertvalue [1 x i32] poison, i32 0, 0
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP1]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT13]]), !continuation.registercount [[META19:![0-9]+]], !waitmask [[META20:![0-9]+]], !continuation.returnedRegistercount [[META19]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i32 poison, i64 [[TMP1]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT13]]), !waitmask [[META6:![0-9]+]], !continuation.returnedRegistercount [[META19:![0-9]+]], !continuation.registercount [[META19]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define dso_local void @Miss.resume.0(
-; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] {
+; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] !continuation.registercount [[META19]] {
 ; CLEANUP-CPS-NEXT:  entryresume.0:
 ; CLEANUP-CPS-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24)
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0
 ; CLEANUP-CPS-NEXT:    [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } [[TMP3]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP5]], 0
+; CLEANUP-CPS-NEXT:    [[TMP12:%.*]] = freeze [[STRUCT_SMALLPAYLOAD:%.*]] poison
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_0_EXTRACT65:%.*]] = extractvalue [[STRUCT_SMALLPAYLOAD]] [[TMP12]], 0, 0
 ; CLEANUP-CPS-NEXT:    [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } [[TMP3]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT45:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP6]], 0
 ; CLEANUP-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
@@ -513,18 +526,22 @@ attributes #3 = { nounwind memory(none) }
 ; CLEANUP-CPS-NEXT:    store i32 0, ptr addrspace(32) [[TMP10]], align 4
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_INSERT17:%.*]] = insertvalue [2 x i32] poison, i32 [[TMP8]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT17]], i32 0, 1
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP11]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]], [10 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META13]], !waitmask [[META20]], !continuation.returnedRegistercount [[META13]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i32 poison, i64 [[TMP11]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]], [10 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !waitmask [[META6]], !continuation.returnedRegistercount [[META13]], !continuation.registercount [[META13]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define dso_local void @Miss.resume.1(
-; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [2 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] {
+; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [2 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] !continuation.registercount [[META13]] {
 ; CLEANUP-CPS-NEXT:  entryresume.1:
 ; CLEANUP-CPS-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24)
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0
 ; CLEANUP-CPS-NEXT:    [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } [[TMP3]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT18:%.*]] = extractvalue [2 x i32] [[TMP5]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x i32] [[TMP5]], 1
+; CLEANUP-CPS-NEXT:    [[TMP19:%.*]] = freeze [[STRUCT_MEDIUMPAYLOAD:%.*]] poison
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_0_EXTRACT59:%.*]] = extractvalue [[STRUCT_MEDIUMPAYLOAD]] [[TMP19]], 0, 0
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_1_EXTRACT61:%.*]] = extractvalue [[STRUCT_MEDIUMPAYLOAD]] [[TMP19]], 0, 1
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_2_EXTRACT63:%.*]] = extractvalue [[STRUCT_MEDIUMPAYLOAD]] [[TMP19]], 0, 2
 ; CLEANUP-CPS-NEXT:    [[TMP7:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT18]] to ptr addrspace(32)
 ; CLEANUP-CPS-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(32) [[TMP7]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP7]], i32 1
@@ -553,18 +570,24 @@ attributes #3 = { nounwind memory(none) }
 ; CLEANUP-CPS-NEXT:    store i32 0, ptr addrspace(32) [[TMP17]], align 4
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_INSERT23:%.*]] = insertvalue [2 x i32] poison, i32 [[TMP13]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_1_INSERT26:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT23]], i32 0, 1
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP18]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [10 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT26]]), !continuation.registercount [[META13]], !waitmask [[META20]], !continuation.returnedRegistercount [[META13]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i32 poison, i64 [[TMP18]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [10 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT26]]), !waitmask [[META6]], !continuation.returnedRegistercount [[META13]], !continuation.registercount [[META13]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define dso_local void @Miss.resume.2(
-; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [2 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] {
+; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [2 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] !continuation.registercount [[META13]] {
 ; CLEANUP-CPS-NEXT:  entryresume.2:
 ; CLEANUP-CPS-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24)
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0
 ; CLEANUP-CPS-NEXT:    [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } [[TMP3]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT27:%.*]] = extractvalue [2 x i32] [[TMP5]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_1_EXTRACT29:%.*]] = extractvalue [2 x i32] [[TMP5]], 1
+; CLEANUP-CPS-NEXT:    [[TMP16:%.*]] = freeze [[STRUCT_LARGEPAYLOAD:%.*]] poison
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP16]], 0, 0
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP16]], 0, 1
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP16]], 0, 2
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP16]], 0, 3
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP16]], 0, 4
 ; CLEANUP-CPS-NEXT:    [[TMP7:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT27]] to ptr addrspace(32)
 ; CLEANUP-CPS-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(32) [[TMP7]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP7]], i32 1
@@ -580,11 +603,11 @@ attributes #3 = { nounwind memory(none) }
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_FCA_0_EXTRACT_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MISS_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 2
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_FCA_0_EXTRACT_RELOAD:%.*]] = load i32, ptr addrspace(32) [[PAYLOAD_FCA_0_EXTRACT_RELOAD_ADDR]], align 4
 ; CLEANUP-CPS-NEXT:    [[RETURN_ADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MISS_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 1
-; CLEANUP-CPS-NEXT:    [[RETURN_ADDR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[RETURN_ADDR_RELOAD_ADDR]], align 4
+; CLEANUP-CPS-NEXT:    [[RETURNADDR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[RETURN_ADDR_RELOAD_ADDR]], align 4
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_INSERT44:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT49]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 0
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 24)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT44]], [27 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META19]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT44]], [27 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META19]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
@@ -636,6 +659,8 @@ attributes #3 = { nounwind memory(none) }
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP2:%.*]] = add i32 [[TMP13]], -28
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } [[TMP1]], 2
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP4]], 0
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP15:%.*]] = freeze [[STRUCT_SMALLPAYLOAD:%.*]] poison
+; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_0_EXTRACT66:%.*]] = extractvalue [[STRUCT_SMALLPAYLOAD]] [[TMP15]], 0, 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP14:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } [[TMP1]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT42:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP14]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
@@ -672,6 +697,10 @@ attributes #3 = { nounwind memory(none) }
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } [[TMP1]], 2
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP3:%.*]] = extractvalue [2 x i32] [[TMP4]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x i32] [[TMP4]], 1
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP14:%.*]] = freeze [[STRUCT_MEDIUMPAYLOAD:%.*]] poison
+; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_0_EXTRACT60:%.*]] = extractvalue [[STRUCT_MEDIUMPAYLOAD]] [[TMP14]], 0, 0
+; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_1_EXTRACT62:%.*]] = extractvalue [[STRUCT_MEDIUMPAYLOAD]] [[TMP14]], 0, 1
+; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_2_EXTRACT64:%.*]] = extractvalue [[STRUCT_MEDIUMPAYLOAD]] [[TMP14]], 0, 2
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21)
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP5]], i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(21) [[TMP6]], align 4
@@ -723,6 +752,12 @@ attributes #3 = { nounwind memory(none) }
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } [[TMP1]], 2
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP3:%.*]] = extractvalue [2 x i32] [[TMP4]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_1_EXTRACT31:%.*]] = extractvalue [2 x i32] [[TMP4]], 1
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP31:%.*]] = freeze [[STRUCT_LARGEPAYLOAD:%.*]] poison
+; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_0_EXTRACT54:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP31]], 0, 0
+; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP31]], 0, 1
+; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP31]], 0, 2
+; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP31]], 0, 3
+; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_LARGEPAYLOAD]] [[TMP31]], 0, 4
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21)
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP5]], i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(21) [[TMP6]], align 4
diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll b/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll
index ed75c1e686..6660f93c4a 100644
--- a/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll
+++ b/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
 ; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s
-; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,remove-types-metadata" \
+; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata" \
 ; RUN:     -S --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s
-; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,remove-types-metadata" \
+; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,remove-types-metadata" \
 ; RUN:     -S --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS %s
 ; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-CPS %s
 ; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata" \
@@ -129,11 +129,11 @@ attributes #1 = { alwaysinline }
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP6]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP7:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP11:%.*]] = call ptr inttoptr (i64 2 to ptr)(i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [10 x i32] poison, [1 x i32] [[TMP7]]), !continuation.registercount [[META15]], !continuation.returnedRegistercount [[META15]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP12:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } @await(ptr [[TMP11]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP12:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa10i32a1i32s(i64 2, i32 4, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [10 x i32] poison, [1 x i32] [[TMP7]]), !continuation.registercount [[META15]], !continuation.returnedRegistercount [[META15]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP12]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [1 x i32] [[TMP13]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_MYPARAMS]] poison, ptr [[TMP2]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP14:%.*]] = freeze [[STRUCT_MYPARAMS]] poison
+; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_MYPARAMS]] [[TMP14]], ptr [[TMP2]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP10:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP10]], ptr [[TMP9]], align 4
@@ -151,7 +151,7 @@ attributes #1 = { alwaysinline }
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP22]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP23:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP20:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP23]], [10 x i32] poison, [1 x i32] [[TMP20]]), !continuation.registercount [[META15]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP23]], [10 x i32] poison, [1 x i32] [[TMP20]]), !continuation.registercount [[META15]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ;
@@ -172,29 +172,34 @@ attributes #1 = { alwaysinline }
 ; CLEANUP-NEXT:    [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT9]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0
 ; CLEANUP-NEXT:    [[TMP2:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @called.resume.0)
-; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 2, i32 -1, {} poison, i64 [[TMP2]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META15]], !continuation.returnedRegistercount [[META15]]
+; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 2, i32 -1, {} poison, i32 poison, i64 [[TMP2]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META15]], !continuation.returnedRegistercount [[META15]]
 ; CLEANUP-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-LABEL: define dso_local void @called.resume.0(
 ; CLEANUP-SAME: i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [10 x i32], [1 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META17]] !continuation.registercount [[META15]] !continuation [[META18]] {
 ; CLEANUP-NEXT:  entryresume.0:
+; CLEANUP-NEXT:    [[TMP6:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8
+; CLEANUP-NEXT:    [[TMP9:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 0
+; CLEANUP-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], ptr [[TMP6]], align 4
 ; CLEANUP-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8)
 ; CLEANUP-NEXT:    [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 2
 ; CLEANUP-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP4]], 0
+; CLEANUP-NEXT:    [[TMP10:%.*]] = freeze [[STRUCT_MYPARAMS:%.*]] poison
+; CLEANUP-NEXT:    [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_MYPARAMS]] [[TMP10]], 0
 ; CLEANUP-NEXT:    [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP5]], 0
 ; CLEANUP-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
 ; CLEANUP-NEXT:    [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
 ; CLEANUP-NEXT:    [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(32) [[RETURNADDR_RELOAD_ADDR]], align 4
-; CLEANUP-NEXT:    [[TMP2:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index()
+; CLEANUP-NEXT:    [[TMP2:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP6]])
 ; CLEANUP-NEXT:    [[A:%.*]] = extractelement <3 x i32> [[TMP2]], i8 0
-; CLEANUP-NEXT:    [[TMP3:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.dimensions()
+; CLEANUP-NEXT:    [[TMP3:%.*]] = call <3 x i32> @_cont_DispatchRaysDimensions3(ptr [[TMP6]])
 ; CLEANUP-NEXT:    [[B:%.*]] = extractelement <3 x i32> [[TMP3]], i8 0
 ; CLEANUP-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT3]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_0_INSERT1:%.*]] = insertvalue [1 x i32] poison, i32 [[DOTFCA_0_EXTRACT]], 0
 ; CLEANUP-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]), !continuation.registercount [[META15]]
+; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]), !continuation.registercount [[META15]]
 ; CLEANUP-NEXT:    unreachable
 ;
 ;
@@ -231,12 +236,14 @@ attributes #1 = { alwaysinline }
 ; POSTPROCESS-NEXT:    [[SYSTEM_DATA_ALLOCA1:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8
 ; POSTPROCESS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; POSTPROCESS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
-; POSTPROCESS-NEXT:    [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 0
-; POSTPROCESS-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP3]], ptr [[SYSTEM_DATA_ALLOCA1]], align 4
+; POSTPROCESS-NEXT:    [[TMP16:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 0
+; POSTPROCESS-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP16]], ptr [[SYSTEM_DATA_ALLOCA1]], align 4
 ; POSTPROCESS-NEXT:    [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4
 ; POSTPROCESS-NEXT:    [[TMP2:%.*]] = add i32 [[TMP11]], -8
 ; POSTPROCESS-NEXT:    [[TMP12:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 2
 ; POSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP12]], 0
+; POSTPROCESS-NEXT:    [[TMP17:%.*]] = freeze [[STRUCT_MYPARAMS:%.*]] poison
+; POSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_MYPARAMS]] [[TMP17]], 0
 ; POSTPROCESS-NEXT:    [[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP1]], 0
 ; POSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP13]], 0
 ; POSTPROCESS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
@@ -278,10 +285,11 @@ attributes #1 = { alwaysinline }
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP5]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP6:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP7:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa10i32a1i32s(i32 2, i32 4, i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [10 x i32] poison, [1 x i32] [[TMP6]]), !continuation.registercount [[META15:![0-9]+]], !continuation.returnedRegistercount [[META15]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP7:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa10i32a1i32s(i32 2, i32 4, i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [10 x i32] poison, [1 x i32] [[TMP6]]), !continuation.returnedRegistercount [[META15:![0-9]+]], !continuation.registercount [[META15]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP7]], 2
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [1 x i32] [[TMP8]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_MYPARAMS]] poison, ptr [[TMP1]], align 4
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP18:%.*]] = freeze [[STRUCT_MYPARAMS]] poison
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_MYPARAMS]] [[TMP18]], ptr [[TMP1]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP11:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP11]], ptr [[TMP10]], align 4
@@ -299,7 +307,7 @@ attributes #1 = { alwaysinline }
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP15]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP16:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP17:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP16]], [10 x i32] poison, [1 x i32] [[TMP17]]), !continuation.registercount [[META15]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP16]], [10 x i32] poison, [1 x i32] [[TMP17]]), !continuation.registercount [[META15]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ;
 ;
@@ -309,7 +317,8 @@ attributes #1 = { alwaysinline }
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @called(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META17:![0-9]+]] !lgc.cps [[META18:![0-9]+]] !continuation [[META19:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]])
+; CLEANUP-CPS-SAME: !lgc.rt.shaderstage [[META17:![0-9]+]] !lgc.cps [[META18:![0-9]+]] !continuation [[META19:![0-9]+]] !continuation.stacksize [[META13:![0-9]+]] !continuation.state [[META13]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8)
 ; CLEANUP-CPS-NEXT:    [[RETURN_ADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
@@ -320,33 +329,34 @@ attributes #1 = { alwaysinline }
 ; CLEANUP-CPS-NEXT:    [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[SYSTEM_DATA_FCA_0_EXTRACT]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0
 ; CLEANUP-CPS-NEXT:    [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @called.resume.0)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 2, i32 4, {} poison, i64 [[TMP0]], i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META15:![0-9]+]], !continuation.returnedRegistercount [[META15]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 2, i32 4, {} poison, i32 poison, i64 [[TMP0]], i32 5, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.returnedRegistercount [[META15:![0-9]+]], !continuation.registercount [[META15]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define dso_local void @called.resume.0(
-; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [10 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META17]] !lgc.cps [[META18]] !continuation [[META19]] {
+; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [10 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META17]] !lgc.cps [[META18]] !continuation [[META19]] !continuation.registercount [[META15]] {
 ; CLEANUP-CPS-NEXT:  entryresume.0:
-; CLEANUP-CPS-NEXT:    [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, align 8
-; CLEANUP-CPS-NEXT:    store { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], ptr [[TMP4]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP8:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8
+; CLEANUP-CPS-NEXT:    [[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], 0
+; CLEANUP-CPS-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP13]], ptr [[TMP8]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP5:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8)
 ; CLEANUP-CPS-NEXT:    [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP6]], 0
+; CLEANUP-CPS-NEXT:    [[TMP14:%.*]] = freeze [[STRUCT_MYPARAMS:%.*]] poison
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_MYPARAMS]] [[TMP14]], 0
 ; CLEANUP-CPS-NEXT:    [[TMP7:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT10:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP7]], 0
 ; CLEANUP-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
 ; CLEANUP-CPS-NEXT:    [[RETURN_ADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) [[TMP5]], i32 0, i32 0
 ; CLEANUP-CPS-NEXT:    [[RETURN_ADDR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[RETURN_ADDR_RELOAD_ADDR]], align 4
-; CLEANUP-CPS-NEXT:    [[TMP8:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0
 ; CLEANUP-CPS-NEXT:    [[TMP9:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP8]])
 ; CLEANUP-CPS-NEXT:    [[A:%.*]] = extractelement <3 x i32> [[TMP9]], i8 0
-; CLEANUP-CPS-NEXT:    [[TMP10:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0
-; CLEANUP-CPS-NEXT:    [[TMP11:%.*]] = call <3 x i32> @_cont_DispatchRaysDimensions3(ptr [[TMP10]])
+; CLEANUP-CPS-NEXT:    [[TMP11:%.*]] = call <3 x i32> @_cont_DispatchRaysDimensions3(ptr [[TMP8]])
 ; CLEANUP-CPS-NEXT:    [[B:%.*]] = extractelement <3 x i32> [[TMP11]], i8 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_INSERT9:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT10]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 [[DOTFCA_0_EXTRACT]], 0
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META15]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META15]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
@@ -356,7 +366,7 @@ attributes #1 = { alwaysinline }
 ;
 ;
 ; POSTPROCESS-CPS-LABEL: define void @called(
-; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !continuation [[META17:![0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !lgc.cps [[META19:![0-9]+]] {
+; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [10 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !continuation [[META17:![0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !lgc.cps [[META19:![0-9]+]] !continuation.stacksize [[META13:![0-9]+]] {
 ; POSTPROCESS-CPS-NEXT:  AllocaSpillBB:
 ; POSTPROCESS-CPS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; POSTPROCESS-CPS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -380,25 +390,26 @@ attributes #1 = { alwaysinline }
 ; POSTPROCESS-CPS-LABEL: define dso_local void @called.resume.0(
 ; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [10 x i32], [1 x i32] } [[TMP3:%.*]]) !continuation [[META17]] !lgc.rt.shaderstage [[META18]] !lgc.cps [[META19]] {
 ; POSTPROCESS-CPS-NEXT:  entryresume.0:
-; POSTPROCESS-CPS-NEXT:    [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, align 8
+; POSTPROCESS-CPS-NEXT:    [[TMP11:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8
 ; POSTPROCESS-CPS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; POSTPROCESS-CPS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
-; POSTPROCESS-CPS-NEXT:    store { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], ptr [[TMP4]], align 4
+; POSTPROCESS-CPS-NEXT:    [[TMP20:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], 0
+; POSTPROCESS-CPS-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP20]], ptr [[TMP11]], align 4
 ; POSTPROCESS-CPS-NEXT:    [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4
 ; POSTPROCESS-CPS-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], -8
 ; POSTPROCESS-CPS-NEXT:    [[TMP7:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], 2
 ; POSTPROCESS-CPS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP7]], 0
+; POSTPROCESS-CPS-NEXT:    [[TMP21:%.*]] = freeze [[STRUCT_MYPARAMS:%.*]] poison
+; POSTPROCESS-CPS-NEXT:    [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_MYPARAMS]] [[TMP21]], 0
 ; POSTPROCESS-CPS-NEXT:    [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] } [[TMP3]], 0
 ; POSTPROCESS-CPS-NEXT:    [[DOTFCA_0_EXTRACT10:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP8]], 0
 ; POSTPROCESS-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
 ; POSTPROCESS-CPS-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(21)
 ; POSTPROCESS-CPS-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP9]], i32 0
 ; POSTPROCESS-CPS-NEXT:    [[RETURN_ADDR_RELOAD:%.*]] = load i32, ptr addrspace(21) [[TMP10]], align 4
-; POSTPROCESS-CPS-NEXT:    [[TMP11:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0
 ; POSTPROCESS-CPS-NEXT:    [[TMP12:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP11]])
 ; POSTPROCESS-CPS-NEXT:    [[A:%.*]] = extractelement <3 x i32> [[TMP12]], i8 0
-; POSTPROCESS-CPS-NEXT:    [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [10 x i32], [1 x i32] }, ptr [[TMP4]], i32 0, i32 0
-; POSTPROCESS-CPS-NEXT:    [[TMP14:%.*]] = call <3 x i32> @_cont_DispatchRaysDimensions3(ptr [[TMP13]])
+; POSTPROCESS-CPS-NEXT:    [[TMP14:%.*]] = call <3 x i32> @_cont_DispatchRaysDimensions3(ptr [[TMP11]])
 ; POSTPROCESS-CPS-NEXT:    [[B:%.*]] = extractelement <3 x i32> [[TMP14]], i8 0
 ; POSTPROCESS-CPS-NEXT:    [[DOTFCA_0_INSERT9:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT10]], 0
 ; POSTPROCESS-CPS-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 [[DOTFCA_0_EXTRACT]], 0
diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll b/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll
index 3bfff91b94..d9133ef009 100644
--- a/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll
+++ b/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll
@@ -66,6 +66,8 @@ declare !pointeetys !26 void @_AmdRestoreSystemData(%struct.DispatchSystemData*)
 ; Function Attrs: nounwind memory(none)
 declare !pointeetys !28 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #3
 
+declare !pointeetys !30 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 ; Function Attrs: alwaysinline
 define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #1 !pointeetys !30 {
   ret i32 5
@@ -176,7 +178,7 @@ attributes #3 = { nounwind memory(none) }
 ; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; CHECK-NEXT:    [[TMP33:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP32]], align 4
 ; CHECK-NEXT:    [[TMP36:%.*]] = load [11 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP33]], [16 x i32] poison, [11 x i32] [[TMP36]]), !continuation.registercount [[META22]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP33]], [16 x i32] poison, [11 x i32] [[TMP36]]), !continuation.registercount [[META22]]
 ; CHECK-NEXT:    unreachable
 ;
 ;
@@ -256,7 +258,7 @@ attributes #3 = { nounwind memory(none) }
 ; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; CHECK-NEXT:    [[TMP46:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP45]], align 4
 ; CHECK-NEXT:    [[TMP53:%.*]] = load [14 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP46]], [16 x i32] poison, [14 x i32] [[TMP53]]), !continuation.registercount [[META19]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP46]], [16 x i32] poison, [14 x i32] [[TMP53]]), !continuation.registercount [[META19]]
 ; CHECK-NEXT:    unreachable
 ;
 ;
diff --git a/llvmraytracing/test/dx/lower-rt-pipeline.ll b/llvmraytracing/test/dx/lower-rt-pipeline.ll
index 54e4b0cdc6..21641d524f 100644
--- a/llvmraytracing/test/dx/lower-rt-pipeline.ll
+++ b/llvmraytracing/test/dx/lower-rt-pipeline.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
 ; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s
 ; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-CPS %s
-; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS %s
+; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS %s
 ; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANUP-CPS %s
 ; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS-CPS %s
 
@@ -535,11 +535,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP21:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP21]], ptr [[TMP20]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP31:%.*]] = load [4 x i32], ptr [[TMP37]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP39:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [4 x i32] [[TMP31]]), !continuation.registercount [[META34:![0-9]+]], !continuation.returnedRegistercount [[META34]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP41:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } @await(ptr [[TMP39]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP41:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa27i32a4i32s(i64 4, i32 8, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [4 x i32] [[TMP31]]), !continuation.registercount [[META34:![0-9]+]], !continuation.returnedRegistercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP42:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP41]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [4 x i32] [[TMP42]], ptr [[TMP37]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP40:%.*]] = freeze [[STRUCT_RAYPAYLOAD]] poison
+; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_RAYPAYLOAD]] [[TMP40]], ptr [[TMP4]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP37]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP23]], ptr [[TMP22]], align 4
@@ -565,8 +565,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP29]], i8 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP30:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index()
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP30]], i8 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]])
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP32:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP40]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 })
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP48:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP32:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP48]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 })
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP33:%.*]] = extractelement <4 x float> [[TMP28]], i64 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP34:%.*]] = extractelement <4 x float> [[TMP28]], i64 1
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP35:%.*]] = extractelement <4 x float> [[TMP28]], i64 2
@@ -641,20 +641,20 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP38:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP37]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP45:%.*]] = load [4 x i32], ptr [[TMP39]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP38]], [27 x i32] poison, [4 x i32] [[TMP45]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP38]], [27 x i32] poison, [4 x i32] [[TMP45]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ;
 ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @MyAnyHitShader(
 ; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META43:![0-9]+]] {
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP3:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP5:%.*]] = alloca [[STRUCT_HITDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP6:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP7:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP8:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP9:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP10:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP3:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP5:%.*]] = alloca [[STRUCT_HITDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP11:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP17:%.*]] = alloca [4 x i32], align 4
@@ -668,9 +668,9 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP17]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP15]], ptr [[TMP14]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP19]], ptr [[TMP16]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP19]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP18]], ptr [[TMP16]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP26]], align 4
@@ -684,10 +684,10 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP23]], ptr [[TMP11]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP42:%.*]] = load i32, ptr [[TMP11]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP42]], ptr [[ORIGHITATTRS]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP61:%.*]] = load i32, ptr [[TMP45]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP61]], ptr [[TMP44]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 1
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP59:%.*]] = load i32, ptr [[TMP44]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP59]], ptr [[TMP43]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP1]], ptr [[HITATTRSALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0, i32 0
@@ -735,8 +735,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    call void @_cont_AcceptHitAndEndSearch(ptr [[TMP40]])
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP59:%.*]] = load i32, ptr [[TMP41]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP59]], ptr [[TMP17]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP45:%.*]] = load i32, ptr [[TMP41]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP45]], ptr [[TMP17]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 1
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP48:%.*]] = load i32, ptr [[TMP47]], align 4
@@ -747,20 +747,20 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP51]], ptr [[TMP49]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 3
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP63:%.*]] = load i32, ptr [[TMP60]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP63]], ptr [[TMP52]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP64:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP64]], ptr [[TMP10]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP80:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP81]], ptr [[TMP80]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP61:%.*]] = load i32, ptr [[TMP60]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP61]], ptr [[TMP52]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP63:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP63]], ptr [[TMP10]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP80:%.*]] = load i32, ptr [[TMP64]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP80]], ptr [[TMP65]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP53:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP10]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP53]], ptr [[TMP54]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP55:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP62:%.*]] = load [4 x i32], ptr [[TMP17]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP55]], [2 x i32] poison, [4 x i32] [[TMP62]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP55]], [2 x i32] poison, [4 x i32] [[TMP62]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE:       63:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0
@@ -774,24 +774,24 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP69]], ptr [[TMP67]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP82:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP84:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 2
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP87:%.*]] = load i32, ptr [[TMP84]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP87]], ptr [[TMP82]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP81:%.*]] = load i32, ptr [[TMP84]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP81]], ptr [[TMP82]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP88:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 3
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 3
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP75:%.*]] = load i32, ptr [[TMP74]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP75]], ptr [[TMP88]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP103:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP103]], ptr [[TMP9]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP104:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP87:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP87]], ptr [[TMP9]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP103:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP78:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP79:%.*]] = load i32, ptr [[TMP104]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP79:%.*]] = load i32, ptr [[TMP103]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP79]], ptr [[TMP78]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP70:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP9]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP70]], ptr [[TMP71]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP72:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP83:%.*]] = load [4 x i32], ptr [[TMP17]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP72]], [2 x i32] poison, [4 x i32] [[TMP83]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP72]], [2 x i32] poison, [4 x i32] [[TMP83]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE:       84:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    br i1 [[TMP37]], label [[TMP85:%.*]], label [[TMP128:%.*]]
@@ -801,12 +801,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    call void @_cont_IgnoreHit(ptr [[TMP76]])
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP105]], ptr [[TMP17]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP125:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP126:%.*]] = getelementptr inbounds i32, ptr [[TMP77]], i32 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP92:%.*]] = load i32, ptr [[TMP126]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP92]], ptr [[TMP125]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP104:%.*]] = load i32, ptr [[TMP77]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP104]], ptr [[TMP17]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP105:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP125:%.*]] = getelementptr inbounds i32, ptr [[TMP77]], i32 1
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP92:%.*]] = load i32, ptr [[TMP125]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP92]], ptr [[TMP105]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP129:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i32, ptr [[TMP77]], i32 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP95:%.*]] = load i32, ptr [[TMP144]], align 4
@@ -826,14 +826,14 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP89]], ptr [[TMP90]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP91:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP145:%.*]] = load [4 x i32], ptr [[TMP17]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP91]], [2 x i32] poison, [4 x i32] [[TMP145]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP91]], [2 x i32] poison, [4 x i32] [[TMP145]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE:       107:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    call void @_cont_IgnoreHit(ptr [[TMP93]])
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP146:%.*]] = load i32, ptr [[TMP94]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP146]], ptr [[TMP17]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP126:%.*]] = load i32, ptr [[TMP94]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP126]], ptr [[TMP17]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP111:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP112:%.*]] = getelementptr inbounds i32, ptr [[TMP94]], i32 1
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP113:%.*]] = load i32, ptr [[TMP112]], align 4
@@ -849,15 +849,15 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP120:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP120]], ptr [[TMP7]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP121:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP148:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP149:%.*]] = load i32, ptr [[TMP121]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP149]], ptr [[TMP148]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP146:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP148:%.*]] = load i32, ptr [[TMP121]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP148]], ptr [[TMP146]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP106:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP7]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP107:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP106]], ptr [[TMP107]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP108:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP127:%.*]] = load [4 x i32], ptr [[TMP17]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP108]], [2 x i32] poison, [4 x i32] [[TMP127]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP108]], [2 x i32] poison, [4 x i32] [[TMP127]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE:       128:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store <4 x float> [[TMP29]], ptr [[TMP28]], align 4
@@ -888,7 +888,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP122]], ptr [[TMP123]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP124:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP147:%.*]] = load [4 x i32], ptr [[TMP17]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP124]], [2 x i32] poison, [4 x i32] [[TMP147]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP124]], [2 x i32] poison, [4 x i32] [[TMP147]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ;
@@ -918,8 +918,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TRAV_DATA_I:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP8:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP9:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP23:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]], {} poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP24:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } @await.1(ptr [[TMP23]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP24:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa2i32a30i32s(i64 3, i32 16, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]], {} poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP26:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP24]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [30 x i32] [[TMP26]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP24]], 0
@@ -943,16 +942,16 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[ISEND_I1:%.*]] = call i1 @opaqueIsEnd()
 ; LOWERRAYTRACINGPIPELINE-NEXT:    br i1 [[ISEND_I1]], label [[TMP19:%.*]], label [[TMP21:%.*]]
-; LOWERRAYTRACINGPIPELINE:       23:
+; LOWERRAYTRACINGPIPELINE:       22:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP20:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP25:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [2 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [2 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
-; LOWERRAYTRACINGPIPELINE:       26:
+; LOWERRAYTRACINGPIPELINE:       25:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP28:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ;
@@ -982,8 +981,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TRAV_DATA_I:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP8:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]], ptr [[TMP4]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP9:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP23:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[TMP8]], {} poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP24:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } @await.2(ptr [[TMP23]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP24:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa2i32a30i32s(i64 3, i32 16, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[TMP8]], {} poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP26:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP24]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [30 x i32] [[TMP26]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP24]], 0
@@ -1007,16 +1005,16 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[ISEND_I1:%.*]] = call i1 @opaqueIsEnd()
 ; LOWERRAYTRACINGPIPELINE-NEXT:    br i1 [[ISEND_I1]], label [[TMP19:%.*]], label [[TMP21:%.*]]
-; LOWERRAYTRACINGPIPELINE:       23:
+; LOWERRAYTRACINGPIPELINE:       22:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP20:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP25:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [2 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [2 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
-; LOWERRAYTRACINGPIPELINE:       26:
+; LOWERRAYTRACINGPIPELINE:       25:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP28:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ;
@@ -1064,7 +1062,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP22:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP21]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP29:%.*]] = load [4 x i32], ptr [[TMP23]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP22]], [27 x i32] poison, [4 x i32] [[TMP29]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP22]], [27 x i32] poison, [4 x i32] [[TMP29]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ;
@@ -1187,7 +1185,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa27i32a4i32s(i32 4, i32 8, i32 5, [30 x i32] poison, [4 x i32] [[TMP21]]), !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP23:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP22]], 2
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [4 x i32] [[TMP23]], ptr [[TMP12]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP27:%.*]] = freeze [[STRUCT_RAYPAYLOAD]] poison
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_RAYPAYLOAD]] [[TMP27]], ptr [[TMP4]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP29:%.*]] = load i32, ptr [[TMP12]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP29]], ptr [[TMP28]], align 4
@@ -1213,8 +1212,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP37]], i8 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP38:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index()
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP38]], i8 1
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP39:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]])
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP39]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 })
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP48:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]])
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP48]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 })
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP41:%.*]] = extractelement <4 x float> [[TMP36]], i64 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP42:%.*]] = extractelement <4 x float> [[TMP36]], i64 1
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP43:%.*]] = extractelement <4 x float> [[TMP36]], i64 2
@@ -1288,20 +1287,20 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP42:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP41]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP43:%.*]] = load [4 x i32], ptr [[TMP5]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP42]], [27 x i32] poison, [4 x i32] [[TMP43]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP42]], [27 x i32] poison, [4 x i32] [[TMP43]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ;
 ;
 ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyAnyHitShader(
 ; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META42]] !lgc.cps [[META34]] !continuation [[META44:![0-9]+]] {
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_HITDATA]], align 8
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP5:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP6:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP7:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP8:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_HITDATA]], align 8
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP9:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP13:%.*]] = alloca [4 x i32], align 4
@@ -1313,10 +1312,10 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP13]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP12]], ptr [[TMP11]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 1
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP16]], ptr [[TMP14]], align 4
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 1
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP15]], ptr [[TMP16]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 2
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP20]], align 4
@@ -1328,12 +1327,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP23:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP22]])
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP23]], ptr [[TMP9]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP9]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP25]], ptr [[ORIGHITATTRS]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP28]], ptr [[TMP26]], align 4
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP9]], align 4
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP28]], ptr [[ORIGHITATTRS]], align 4
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP27]], ptr [[TMP25]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[HIT_ATTRS]], ptr [[HITATTRSALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]])
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0, i32 0
@@ -1381,12 +1380,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void @_cont_AcceptHitAndEndSearch(ptr [[TMP41]])
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP43:%.*]] = load i32, ptr [[TMP42]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP43]], ptr [[TMP13]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP56:%.*]] = load i32, ptr [[TMP42]], align 4
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP56]], ptr [[TMP13]], align 4
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 1
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP45]], ptr [[TMP56]], align 4
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP45]], ptr [[TMP43]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 2
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP48:%.*]] = load i32, ptr [[TMP47]], align 4
@@ -1406,7 +1405,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP57]], ptr [[TMP58]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP59:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP60:%.*]] = load [4 x i32], ptr [[TMP13]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP59]], [2 x i32] poison, [4 x i32] [[TMP60]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP59]], [2 x i32] poison, [4 x i32] [[TMP60]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE-CPS:       60:
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0
@@ -1437,7 +1436,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP78]], ptr [[TMP79]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP80:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP81:%.*]] = load [4 x i32], ptr [[TMP13]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP80]], [2 x i32] poison, [4 x i32] [[TMP81]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP80]], [2 x i32] poison, [4 x i32] [[TMP81]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE-CPS:       81:
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    br i1 [[TMP38]], label [[TMP84:%.*]], label [[TMP141:%.*]]
@@ -1472,7 +1471,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP102]], ptr [[TMP103]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP104:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP106:%.*]] = load [4 x i32], ptr [[TMP13]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP104]], [2 x i32] poison, [4 x i32] [[TMP106]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP104]], [2 x i32] poison, [4 x i32] [[TMP106]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE-CPS:       104:
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP107:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0
@@ -1503,7 +1502,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP124]], ptr [[TMP125]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP126:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP142:%.*]] = load [4 x i32], ptr [[TMP13]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP126]], [2 x i32] poison, [4 x i32] [[TMP142]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP126]], [2 x i32] poison, [4 x i32] [[TMP142]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE-CPS:       125:
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store <4 x float> [[TMP30]], ptr [[TMP29]], align 4
@@ -1534,7 +1533,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP144]], ptr [[TMP145]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP146:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP147:%.*]] = load [4 x i32], ptr [[TMP13]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP146]], [2 x i32] poison, [4 x i32] [[TMP147]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP146]], [2 x i32] poison, [4 x i32] [[TMP147]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ;
 ;
@@ -1591,13 +1590,13 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS:       21:
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP23:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP23]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP23]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE-CPS:       24:
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP25:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP26:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP25]], [2 x i32] poison, [30 x i32] [[TMP26]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP25]], [2 x i32] poison, [30 x i32] [[TMP26]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ;
 ;
@@ -1654,13 +1653,13 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS:       21:
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP23:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP23]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [2 x i32] poison, [30 x i32] [[TMP23]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE-CPS:       24:
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP25:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP26:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP25]], [2 x i32] poison, [30 x i32] [[TMP26]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP25]], [2 x i32] poison, [30 x i32] [[TMP26]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ;
 ;
@@ -1707,7 +1706,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP26:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP25]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP27:%.*]] = load [4 x i32], ptr [[TMP4]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [27 x i32] poison, [4 x i32] [[TMP27]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [27 x i32] poison, [4 x i32] [[TMP27]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ;
 ;
@@ -1837,8 +1836,10 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; POSTPROCESS-NEXT:    [[TMP3:%.*]] = extractvalue [4 x i32] [[TMP16]], 1
 ; POSTPROCESS-NEXT:    [[TMP5:%.*]] = extractvalue [4 x i32] [[TMP16]], 2
 ; POSTPROCESS-NEXT:    [[TMP7:%.*]] = extractvalue [4 x i32] [[TMP16]], 3
+; POSTPROCESS-NEXT:    [[TMP20:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; POSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP20]], 0
 ; POSTPROCESS-NEXT:    [[TMP2:%.*]] = bitcast i32 [[TMP10]] to float
-; POSTPROCESS-NEXT:    [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
+; POSTPROCESS-NEXT:    [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTFCA_0_EXTRACT]], float [[TMP2]], i32 0
 ; POSTPROCESS-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
 ; POSTPROCESS-NEXT:    [[DOTSROA_0_4_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_0_VEC_INSERT]], float [[TMP4]], i32 1
 ; POSTPROCESS-NEXT:    [[TMP6:%.*]] = bitcast i32 [[TMP5]] to float
@@ -1867,16 +1868,14 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; POSTPROCESS-NEXT:    [[VAL_1_I:%.*]] = insertelement <3 x i32> [[VAL_0_I]], i32 [[RES_2_I]], i32 1
 ; POSTPROCESS-NEXT:    [[VAL_2_I:%.*]] = insertelement <3 x i32> [[VAL_1_I]], i32 [[RES_3_I]], i32 2
 ; POSTPROCESS-NEXT:    [[EXTRACT1:%.*]] = extractelement <3 x i32> [[VAL_2_I]], i8 1
-; POSTPROCESS-NEXT:    [[TMP20:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP18]])
-; POSTPROCESS-NEXT:    [[TMP11:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP20]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 })
+; POSTPROCESS-NEXT:    [[TMP21:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP18]])
+; POSTPROCESS-NEXT:    [[TMP11:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP21]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 })
 ; POSTPROCESS-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0
 ; POSTPROCESS-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1
 ; POSTPROCESS-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2
 ; POSTPROCESS-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 3
 ; POSTPROCESS-NEXT:    call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP11]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP12]], float [[TMP13]], float [[TMP14]], float [[TMP15]], i8 15)
 ; POSTPROCESS-NEXT:    ret void
-; POSTPROCESS:       entryresume.0.split:
-; POSTPROCESS-NEXT:    unreachable
 ;
 ;
 ; POSTPROCESS-LABEL: define void @MyClosestHitShader(
@@ -3343,7 +3342,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @MyRayGen(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] !continuation.state [[META22]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT20:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0
 ; CLEANUP-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
@@ -3369,22 +3368,25 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP8]], 1
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP9]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP10]], 3
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 8, {} poison, i64 [[TMP6]], i32 5, [30 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 8, {} poison, i32 poison, i64 [[TMP6]], i32 5, [30 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define dso_local void @MyRayGen.resume.0(
-; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [4 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36]] !continuation [[META37]] {
+; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [4 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36]] !continuation [[META37]] !continuation.registercount [[META34]] {
 ; CLEANUP-CPS-NEXT:  entryresume.0:
-; CLEANUP-CPS-NEXT:    [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, align 8
-; CLEANUP-CPS-NEXT:    store { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], ptr [[TMP4]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP4:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8
+; CLEANUP-CPS-NEXT:    [[TMP12:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], 0
+; CLEANUP-CPS-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP12]], ptr [[TMP4]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 1
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 3
+; CLEANUP-CPS-NEXT:    [[TMP14:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP14]], 0
 ; CLEANUP-CPS-NEXT:    [[TMP6:%.*]] = bitcast i32 [[DOTFCA_0_EXTRACT]] to float
-; CLEANUP-CPS-NEXT:    [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 0
+; CLEANUP-CPS-NEXT:    [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTFCA_0_EXTRACT1]], float [[TMP6]], i32 0
 ; CLEANUP-CPS-NEXT:    [[TMP7:%.*]] = bitcast i32 [[DOTFCA_7_EXTRACT]] to float
 ; CLEANUP-CPS-NEXT:    [[DOTSROA_0_4_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_0_VEC_INSERT]], float [[TMP7]], i32 1
 ; CLEANUP-CPS-NEXT:    [[TMP8:%.*]] = bitcast i32 [[DOTFCA_8_EXTRACT]] to float
@@ -3395,28 +3397,26 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT21:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP10]], 0
 ; CLEANUP-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
 ; CLEANUP-CPS-NEXT:    [[TMP11:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4
-; CLEANUP-CPS-NEXT:    [[TMP12:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, ptr [[TMP4]], i32 0, i32 0
-; CLEANUP-CPS-NEXT:    [[RES_1_I1:%.*]] = load i32, ptr [[TMP12]], align 4
-; CLEANUP-CPS-NEXT:    [[RESPTR_2_I2:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP12]], i32 0, i32 0, i32 1
+; CLEANUP-CPS-NEXT:    [[RES_1_I1:%.*]] = load i32, ptr [[TMP4]], align 4
+; CLEANUP-CPS-NEXT:    [[RESPTR_2_I2:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP4]], i32 0, i32 0, i32 1
 ; CLEANUP-CPS-NEXT:    [[RES_2_I3:%.*]] = load i32, ptr [[RESPTR_2_I2]], align 4
-; CLEANUP-CPS-NEXT:    [[RESPTR_3_I4:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP12]], i32 0, i32 0, i32 2
+; CLEANUP-CPS-NEXT:    [[RESPTR_3_I4:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP4]], i32 0, i32 0, i32 2
 ; CLEANUP-CPS-NEXT:    [[RES_3_I5:%.*]] = load i32, ptr [[RESPTR_3_I4]], align 4
 ; CLEANUP-CPS-NEXT:    [[VAL_0_I6:%.*]] = insertelement <3 x i32> undef, i32 [[RES_1_I1]], i32 0
 ; CLEANUP-CPS-NEXT:    [[VAL_1_I7:%.*]] = insertelement <3 x i32> [[VAL_0_I6]], i32 [[RES_2_I3]], i32 1
 ; CLEANUP-CPS-NEXT:    [[VAL_2_I8:%.*]] = insertelement <3 x i32> [[VAL_1_I7]], i32 [[RES_3_I5]], i32 2
 ; CLEANUP-CPS-NEXT:    [[EXTRACT:%.*]] = extractelement <3 x i32> [[VAL_2_I8]], i8 0
-; CLEANUP-CPS-NEXT:    [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, ptr [[TMP4]], i32 0, i32 0
-; CLEANUP-CPS-NEXT:    [[RES_1_I:%.*]] = load i32, ptr [[TMP13]], align 4
-; CLEANUP-CPS-NEXT:    [[RESPTR_2_I:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP13]], i32 0, i32 0, i32 1
+; CLEANUP-CPS-NEXT:    [[RES_1_I:%.*]] = load i32, ptr [[TMP4]], align 4
+; CLEANUP-CPS-NEXT:    [[RESPTR_2_I:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP4]], i32 0, i32 0, i32 1
 ; CLEANUP-CPS-NEXT:    [[RES_2_I:%.*]] = load i32, ptr [[RESPTR_2_I]], align 4
-; CLEANUP-CPS-NEXT:    [[RESPTR_3_I:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP13]], i32 0, i32 0, i32 2
+; CLEANUP-CPS-NEXT:    [[RESPTR_3_I:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP4]], i32 0, i32 0, i32 2
 ; CLEANUP-CPS-NEXT:    [[RES_3_I:%.*]] = load i32, ptr [[RESPTR_3_I]], align 4
 ; CLEANUP-CPS-NEXT:    [[VAL_0_I:%.*]] = insertelement <3 x i32> undef, i32 [[RES_1_I]], i32 0
 ; CLEANUP-CPS-NEXT:    [[VAL_1_I:%.*]] = insertelement <3 x i32> [[VAL_0_I]], i32 [[RES_2_I]], i32 1
 ; CLEANUP-CPS-NEXT:    [[VAL_2_I:%.*]] = insertelement <3 x i32> [[VAL_1_I]], i32 [[RES_3_I]], i32 2
 ; CLEANUP-CPS-NEXT:    [[EXTRACT1:%.*]] = extractelement <3 x i32> [[VAL_2_I]], i8 1
-; CLEANUP-CPS-NEXT:    [[TMP14:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]])
-; CLEANUP-CPS-NEXT:    [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP14]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 })
+; CLEANUP-CPS-NEXT:    [[TMP13:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]])
+; CLEANUP-CPS-NEXT:    [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP13]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 })
 ; CLEANUP-CPS-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0
 ; CLEANUP-CPS-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1
 ; CLEANUP-CPS-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2
@@ -3426,7 +3426,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @MyClosestHitShader(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META38:![0-9]+]] !lgc.cps [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META38:![0-9]+]] !lgc.cps [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] !continuation.state [[META22]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0
@@ -3479,12 +3479,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP18]], 1
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP19]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP20]], 3
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.registercount [[META34]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.registercount [[META34]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @MyAnyHitShader(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39]] !lgc.cps [[META34]] !continuation [[META41:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], {} [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39]] !lgc.cps [[META34]] !continuation [[META41:![0-9]+]] !continuation.state [[META22]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0
@@ -3687,7 +3687,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP20]], 1
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP21]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP22]], 3
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.registercount [[META34]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.registercount [[META34]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       28:
 ; CLEANUP-CPS-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0
@@ -3756,7 +3756,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_1_INSERT65:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT62]], i32 [[TMP31]], 1
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_2_INSERT68:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT65]], i32 [[TMP32]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_3_INSERT71:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT68]], i32 [[TMP33]], 3
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT264]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT71]]), !continuation.registercount [[META34]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT264]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT71]]), !continuation.registercount [[META34]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       39:
 ; CLEANUP-CPS-NEXT:    br i1 [[TMP15]], label [[TMP40:%.*]], label [[TMP59:%.*]]
@@ -3825,7 +3825,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_1_INSERT77:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT74]], i32 [[TMP44]], 1
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_2_INSERT80:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT77]], i32 [[TMP45]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_3_INSERT83:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT80]], i32 [[TMP46]], 3
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT305]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT83]]), !continuation.registercount [[META34]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT305]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT83]]), !continuation.registercount [[META34]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       50:
 ; CLEANUP-CPS-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0
@@ -3890,7 +3890,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_1_INSERT89:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT86]], i32 [[TMP53]], 1
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_2_INSERT92:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT89]], i32 [[TMP54]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_3_INSERT95:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT92]], i32 [[TMP55]], 3
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT346]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT95]]), !continuation.registercount [[META34]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT346]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT95]]), !continuation.registercount [[META34]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       59:
 ; CLEANUP-CPS-NEXT:    call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]])
@@ -3958,12 +3958,13 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_1_INSERT101:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT98]], i32 [[TMP61]], 1
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_2_INSERT104:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT101]], i32 [[TMP62]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_3_INSERT107:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT104]], i32 [[TMP63]], 3
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT387]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT107]]), !continuation.registercount [[META34]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT387]], [2 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT107]]), !continuation.registercount [[META34]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @MyIntersectionShader(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]]
+; CLEANUP-CPS-SAME: !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] !continuation.stacksize [[META32:![0-9]+]] !continuation.state [[META32]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8)
 ; CLEANUP-CPS-NEXT:    [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
@@ -4072,7 +4073,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    [[TMP2:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShader.resume.0)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP2]], i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT350]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i32 poison, i64 [[TMP2]], i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT350]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       isEnd.i:
 ; CLEANUP-CPS-NEXT:    [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0
@@ -4135,7 +4136,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       9:
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0
@@ -4182,12 +4183,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define dso_local void @MyIntersectionShader.resume.0(
-; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META43]] {
+; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META43]] !continuation.registercount [[META33]] {
 ; CLEANUP-CPS-NEXT:  entryresume.0:
 ; CLEANUP-CPS-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8)
 ; CLEANUP-CPS-NEXT:    [[TMP5:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 2
@@ -4285,7 +4286,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[DOTFCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[DOTFCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       8:
 ; CLEANUP-CPS-NEXT:    [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 0
@@ -4334,12 +4335,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @MyIntersectionShader2(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] !continuation.stacksize [[META32]] !continuation.state [[META32]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8)
 ; CLEANUP-CPS-NEXT:    [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER2_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
@@ -4448,7 +4449,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    [[TMP2:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShader2.resume.0)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP2]], i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT350]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33]], !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i32 poison, i64 [[TMP2]], i32 5, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT350]], {} poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33]], !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       isEnd.i:
 ; CLEANUP-CPS-NEXT:    [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0
@@ -4511,7 +4512,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       9:
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0
@@ -4558,12 +4559,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define dso_local void @MyIntersectionShader2.resume.0(
-; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META44]] {
+; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [2 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META44]] !continuation.registercount [[META33]] {
 ; CLEANUP-CPS-NEXT:  entryresume.0:
 ; CLEANUP-CPS-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8)
 ; CLEANUP-CPS-NEXT:    [[TMP5:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [2 x i32], [30 x i32] } [[TMP3]], 2
@@ -4661,7 +4662,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[DOTFCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[DOTFCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       8:
 ; CLEANUP-CPS-NEXT:    [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER2_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 0
@@ -4710,12 +4711,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [2 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @MyMissShader(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META34]] !lgc.cps [[META39]] !continuation [[META45:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [4 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META34]] !lgc.cps [[META39]] !continuation [[META45:![0-9]+]] !continuation.state [[META22]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 0
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 1
@@ -4744,7 +4745,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[TMP5]], 1
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[TMP6]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_2_INSERT]], i32 [[TMP7]], 3
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.registercount [[META34]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [27 x i32] poison, [4 x i32] [[DOTFCA_3_INSERT]]), !continuation.registercount [[META34]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
@@ -4864,17 +4865,20 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; POSTPROCESS-CPS-LABEL: define dso_local void @MyRayGen.resume.0(
 ; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [4 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36]] !continuation [[META37]] {
 ; POSTPROCESS-CPS-NEXT:  entryresume.0:
-; POSTPROCESS-CPS-NEXT:    [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, align 8
+; POSTPROCESS-CPS-NEXT:    [[TMP12:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8
 ; POSTPROCESS-CPS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; POSTPROCESS-CPS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
-; POSTPROCESS-CPS-NEXT:    store { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], ptr [[TMP4]], align 4
+; POSTPROCESS-CPS-NEXT:    [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], 0
+; POSTPROCESS-CPS-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP4]], ptr [[TMP12]], align 4
 ; POSTPROCESS-CPS-NEXT:    [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] } [[TMP3]], 2
 ; POSTPROCESS-CPS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 0
 ; POSTPROCESS-CPS-NEXT:    [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 1
 ; POSTPROCESS-CPS-NEXT:    [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 2
 ; POSTPROCESS-CPS-NEXT:    [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [4 x i32] [[TMP5]], 3
+; POSTPROCESS-CPS-NEXT:    [[TMP14:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; POSTPROCESS-CPS-NEXT:    [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP14]], 0
 ; POSTPROCESS-CPS-NEXT:    [[TMP6:%.*]] = bitcast i32 [[DOTFCA_0_EXTRACT]] to float
-; POSTPROCESS-CPS-NEXT:    [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 0
+; POSTPROCESS-CPS-NEXT:    [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTFCA_0_EXTRACT1]], float [[TMP6]], i32 0
 ; POSTPROCESS-CPS-NEXT:    [[TMP7:%.*]] = bitcast i32 [[DOTFCA_7_EXTRACT]] to float
 ; POSTPROCESS-CPS-NEXT:    [[DOTSROA_0_4_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_0_VEC_INSERT]], float [[TMP7]], i32 1
 ; POSTPROCESS-CPS-NEXT:    [[TMP8:%.*]] = bitcast i32 [[DOTFCA_8_EXTRACT]] to float
@@ -4885,7 +4889,6 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; POSTPROCESS-CPS-NEXT:    [[DOTFCA_0_EXTRACT21:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP10]], 0
 ; POSTPROCESS-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
 ; POSTPROCESS-CPS-NEXT:    [[TMP11:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4
-; POSTPROCESS-CPS-NEXT:    [[TMP12:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, ptr [[TMP4]], i32 0, i32 0
 ; POSTPROCESS-CPS-NEXT:    [[RES_1_I1:%.*]] = load i32, ptr [[TMP12]], align 4
 ; POSTPROCESS-CPS-NEXT:    [[RESPTR_2_I2:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP12]], i32 0, i32 0, i32 1
 ; POSTPROCESS-CPS-NEXT:    [[RES_2_I3:%.*]] = load i32, ptr [[RESPTR_2_I2]], align 4
@@ -4895,18 +4898,17 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; POSTPROCESS-CPS-NEXT:    [[VAL_1_I7:%.*]] = insertelement <3 x i32> [[VAL_0_I6]], i32 [[RES_2_I3]], i32 1
 ; POSTPROCESS-CPS-NEXT:    [[VAL_2_I8:%.*]] = insertelement <3 x i32> [[VAL_1_I7]], i32 [[RES_3_I5]], i32 2
 ; POSTPROCESS-CPS-NEXT:    [[EXTRACT:%.*]] = extractelement <3 x i32> [[VAL_2_I8]], i8 0
-; POSTPROCESS-CPS-NEXT:    [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [4 x i32] }, ptr [[TMP4]], i32 0, i32 0
-; POSTPROCESS-CPS-NEXT:    [[RES_1_I:%.*]] = load i32, ptr [[TMP13]], align 4
-; POSTPROCESS-CPS-NEXT:    [[RESPTR_2_I:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP13]], i32 0, i32 0, i32 1
+; POSTPROCESS-CPS-NEXT:    [[RES_1_I:%.*]] = load i32, ptr [[TMP12]], align 4
+; POSTPROCESS-CPS-NEXT:    [[RESPTR_2_I:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP12]], i32 0, i32 0, i32 1
 ; POSTPROCESS-CPS-NEXT:    [[RES_2_I:%.*]] = load i32, ptr [[RESPTR_2_I]], align 4
-; POSTPROCESS-CPS-NEXT:    [[RESPTR_3_I:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP13]], i32 0, i32 0, i32 2
+; POSTPROCESS-CPS-NEXT:    [[RESPTR_3_I:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP12]], i32 0, i32 0, i32 2
 ; POSTPROCESS-CPS-NEXT:    [[RES_3_I:%.*]] = load i32, ptr [[RESPTR_3_I]], align 4
 ; POSTPROCESS-CPS-NEXT:    [[VAL_0_I:%.*]] = insertelement <3 x i32> undef, i32 [[RES_1_I]], i32 0
 ; POSTPROCESS-CPS-NEXT:    [[VAL_1_I:%.*]] = insertelement <3 x i32> [[VAL_0_I]], i32 [[RES_2_I]], i32 1
 ; POSTPROCESS-CPS-NEXT:    [[VAL_2_I:%.*]] = insertelement <3 x i32> [[VAL_1_I]], i32 [[RES_3_I]], i32 2
 ; POSTPROCESS-CPS-NEXT:    [[EXTRACT1:%.*]] = extractelement <3 x i32> [[VAL_2_I]], i8 1
-; POSTPROCESS-CPS-NEXT:    [[TMP14:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]])
-; POSTPROCESS-CPS-NEXT:    [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP14]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 })
+; POSTPROCESS-CPS-NEXT:    [[TMP13:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]])
+; POSTPROCESS-CPS-NEXT:    [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP13]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 })
 ; POSTPROCESS-CPS-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0
 ; POSTPROCESS-CPS-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1
 ; POSTPROCESS-CPS-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2
@@ -5469,7 +5471,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; POSTPROCESS-CPS-LABEL: define void @MyIntersectionShader(
-; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] {
+; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] !continuation.stacksize [[META32:![0-9]+]] {
 ; POSTPROCESS-CPS-NEXT:  AllocaSpillBB:
 ; POSTPROCESS-CPS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; POSTPROCESS-CPS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -5872,7 +5874,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; POSTPROCESS-CPS-LABEL: define void @MyIntersectionShader2(
-; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] {
+; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [2 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] !continuation.stacksize [[META32]] {
 ; POSTPROCESS-CPS-NEXT:  AllocaSpillBB:
 ; POSTPROCESS-CPS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; POSTPROCESS-CPS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
diff --git a/llvmraytracing/test/dx/paq-hit-attribute-size.ll b/llvmraytracing/test/dx/paq-hit-attribute-size.ll
index 75389981e7..e012e1c201 100644
--- a/llvmraytracing/test/dx/paq-hit-attribute-size.ll
+++ b/llvmraytracing/test/dx/paq-hit-attribute-size.ll
@@ -104,7 +104,7 @@ define void @AnyHit1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWord
 ; CHECK-MAX-1-NEXT:    call void @_cont_SetTriangleHitAttributes(ptr [[TMP34]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP33]])
 ; CHECK-MAX-1-NEXT:    [[TMP35:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-MAX-1-NEXT:    [[TMP36:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-MAX-1-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP35]], [2 x i32] poison, [4 x i32] [[TMP36]]), !continuation.registercount [[META15]]
+; CHECK-MAX-1-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP35]], [2 x i32] poison, [4 x i32] [[TMP36]]), !continuation.registercount [[META15]]
 ; CHECK-MAX-1-NEXT:    unreachable
 ;
 ; CHECK-MAX-2-LABEL: define %struct.AnyHitSystemData @AnyHit1DWords(
@@ -164,7 +164,7 @@ define void @AnyHit1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWord
 ; CHECK-MAX-2-NEXT:    call void @_cont_SetTriangleHitAttributes(ptr [[TMP34]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP33]])
 ; CHECK-MAX-2-NEXT:    [[TMP35:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-MAX-2-NEXT:    [[TMP36:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-MAX-2-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP35]], [2 x i32] poison, [4 x i32] [[TMP36]]), !continuation.registercount [[META18]]
+; CHECK-MAX-2-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP35]], [2 x i32] poison, [4 x i32] [[TMP36]]), !continuation.registercount [[META18]]
 ; CHECK-MAX-2-NEXT:    unreachable
 ;
 ; CHECK-MAX-4-LABEL: define %struct.AnyHitSystemData @AnyHit1DWords(
@@ -225,7 +225,7 @@ define void @AnyHit1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWord
 ; CHECK-MAX-4-NEXT:    call void @_cont_SetTriangleHitAttributes(ptr [[TMP35]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP34]])
 ; CHECK-MAX-4-NEXT:    [[TMP36:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-MAX-4-NEXT:    [[TMP37:%.*]] = load [6 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-MAX-4-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP36]], [4 x i32] poison, [6 x i32] [[TMP37]]), !continuation.registercount [[META19]]
+; CHECK-MAX-4-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP36]], [4 x i32] poison, [6 x i32] [[TMP37]]), !continuation.registercount [[META19]]
 ; CHECK-MAX-4-NEXT:    unreachable
 ;
 ; CHECK-MAX-8-LABEL: define %struct.AnyHitSystemData @AnyHit1DWords(
@@ -286,7 +286,7 @@ define void @AnyHit1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWord
 ; CHECK-MAX-8-NEXT:    call void @_cont_SetTriangleHitAttributes(ptr [[TMP35]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP34]])
 ; CHECK-MAX-8-NEXT:    [[TMP36:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-MAX-8-NEXT:    [[TMP37:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-MAX-8-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP36]], [8 x i32] poison, [10 x i32] [[TMP37]]), !continuation.registercount [[META20]]
+; CHECK-MAX-8-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP36]], [8 x i32] poison, [10 x i32] [[TMP37]]), !continuation.registercount [[META20]]
 ; CHECK-MAX-8-NEXT:    unreachable
 ;
   ret void
@@ -362,7 +362,7 @@ define void @AnyHit2DWords(%struct.MyPayload* %payload, %struct.Attributes2DWord
 ; CHECK-MAX-2-NEXT:    call void @_cont_SetTriangleHitAttributes(ptr [[TMP40]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP39]])
 ; CHECK-MAX-2-NEXT:    [[TMP41:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-MAX-2-NEXT:    [[TMP42:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-MAX-2-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP41]], [2 x i32] poison, [4 x i32] [[TMP42]]), !continuation.registercount [[META18]]
+; CHECK-MAX-2-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP41]], [2 x i32] poison, [4 x i32] [[TMP42]]), !continuation.registercount [[META18]]
 ; CHECK-MAX-2-NEXT:    unreachable
 ;
 ; CHECK-MAX-4-LABEL: define %struct.AnyHitSystemData @AnyHit2DWords(
@@ -431,7 +431,7 @@ define void @AnyHit2DWords(%struct.MyPayload* %payload, %struct.Attributes2DWord
 ; CHECK-MAX-4-NEXT:    call void @_cont_SetTriangleHitAttributes(ptr [[TMP41]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP40]])
 ; CHECK-MAX-4-NEXT:    [[TMP42:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-MAX-4-NEXT:    [[TMP43:%.*]] = load [6 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-MAX-4-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP42]], [4 x i32] poison, [6 x i32] [[TMP43]]), !continuation.registercount [[META19]]
+; CHECK-MAX-4-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP42]], [4 x i32] poison, [6 x i32] [[TMP43]]), !continuation.registercount [[META19]]
 ; CHECK-MAX-4-NEXT:    unreachable
 ;
 ; CHECK-MAX-8-LABEL: define %struct.AnyHitSystemData @AnyHit2DWords(
@@ -500,7 +500,7 @@ define void @AnyHit2DWords(%struct.MyPayload* %payload, %struct.Attributes2DWord
 ; CHECK-MAX-8-NEXT:    call void @_cont_SetTriangleHitAttributes(ptr [[TMP41]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP40]])
 ; CHECK-MAX-8-NEXT:    [[TMP42:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-MAX-8-NEXT:    [[TMP43:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-MAX-8-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP42]], [8 x i32] poison, [10 x i32] [[TMP43]]), !continuation.registercount [[META20]]
+; CHECK-MAX-8-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP42]], [8 x i32] poison, [10 x i32] [[TMP43]]), !continuation.registercount [[META20]]
 ; CHECK-MAX-8-NEXT:    unreachable
 ;
   ret void
@@ -596,7 +596,7 @@ define void @AnyHit4DWords(%struct.MyPayload* %payload, %struct.Attributes4DWord
 ; CHECK-MAX-4-NEXT:    call void @_cont_SetTriangleHitAttributes(ptr [[TMP52]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP51]])
 ; CHECK-MAX-4-NEXT:    [[TMP53:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-MAX-4-NEXT:    [[TMP54:%.*]] = load [6 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-MAX-4-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP53]], [4 x i32] poison, [6 x i32] [[TMP54]]), !continuation.registercount [[META19]]
+; CHECK-MAX-4-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP53]], [4 x i32] poison, [6 x i32] [[TMP54]]), !continuation.registercount [[META19]]
 ; CHECK-MAX-4-NEXT:    unreachable
 ;
 ; CHECK-MAX-8-LABEL: define %struct.AnyHitSystemData @AnyHit4DWords(
@@ -680,7 +680,7 @@ define void @AnyHit4DWords(%struct.MyPayload* %payload, %struct.Attributes4DWord
 ; CHECK-MAX-8-NEXT:    call void @_cont_SetTriangleHitAttributes(ptr [[TMP52]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP51]])
 ; CHECK-MAX-8-NEXT:    [[TMP53:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-MAX-8-NEXT:    [[TMP54:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-MAX-8-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP53]], [8 x i32] poison, [10 x i32] [[TMP54]]), !continuation.registercount [[META20]]
+; CHECK-MAX-8-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP53]], [8 x i32] poison, [10 x i32] [[TMP54]]), !continuation.registercount [[META20]]
 ; CHECK-MAX-8-NEXT:    unreachable
 ;
   ret void
@@ -812,7 +812,7 @@ define void @AnyHit8DWords(%struct.MyPayload* %payload, %struct.Attributes8DWord
 ; CHECK-MAX-8-NEXT:    call void @_cont_SetTriangleHitAttributes(ptr [[TMP76]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP75]])
 ; CHECK-MAX-8-NEXT:    [[TMP77:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-MAX-8-NEXT:    [[TMP78:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-MAX-8-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP77]], [8 x i32] poison, [10 x i32] [[TMP78]]), !continuation.registercount [[META20]]
+; CHECK-MAX-8-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP77]], [8 x i32] poison, [10 x i32] [[TMP78]]), !continuation.registercount [[META20]]
 ; CHECK-MAX-8-NEXT:    unreachable
 ;
   ret void
@@ -864,6 +864,8 @@ declare !pointeetys !42 i1 @_cont_ReportHit(%struct.AnyHitSystemData*, float, i3
 
 declare !pointeetys !44 void @_cont_AcceptHit(%struct.AnyHitSystemData*)
 
+declare !pointeetys !45 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 ; Function Attrs: alwaysinline
 define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #3 !pointeetys !45 {
 ; CHECK-LABEL: define i32 @_cont_GetLocalRootIndex(
diff --git a/llvmraytracing/test/dx/payload-caller-in-paq.ll b/llvmraytracing/test/dx/payload-caller-in-paq.ll
index 4bf8d2575f..9f965da17f 100644
--- a/llvmraytracing/test/dx/payload-caller-in-paq.ll
+++ b/llvmraytracing/test/dx/payload-caller-in-paq.ll
@@ -58,11 +58,11 @@ define void @RayGen() #0 {
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP33:%.*]] = load i32, ptr [[TMP10]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP33]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP12:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP17:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[TMP12]]), !continuation.registercount [[META32:![0-9]+]], !waitmask [[META33:![0-9]+]], !continuation.returnedRegistercount [[META25:![0-9]+]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP20:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [12 x i32], [3 x i32] } @await(ptr [[TMP17]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP20:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [12 x i32], [3 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa12i32a3i32s(i64 4, i32 8, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[TMP12]]), !continuation.registercount [[META32:![0-9]+]], !waitmask [[META13]], !continuation.returnedRegistercount [[META25:![0-9]+]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP15:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [12 x i32], [3 x i32] } [[TMP20]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [3 x i32] [[TMP15]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_MYPAYLOAD]] poison, ptr [[TMP4]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP27:%.*]] = freeze [[STRUCT_MYPAYLOAD]] poison
+; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_MYPAYLOAD]] [[TMP27]], ptr [[TMP4]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP19:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP19]], ptr [[TMP14]], align 4
@@ -85,10 +85,10 @@ define void @RayGen() #0 {
 ; LOWERRAYTRACINGPIPELINE:       .split:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP42:%.*]] = load float, ptr [[TMP6]], align 8, !tbaa [[TBAA28]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP4]], i32 0, i32 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4, !tbaa [[TBAA34:![0-9]+]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4, !tbaa [[TBAA33:![0-9]+]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP45:%.*]] = sitofp i32 [[TMP44]] to float
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP4]], i32 0, i32 2
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP47:%.*]] = load double, ptr [[TMP46]], align 8, !tbaa [[TBAA36:![0-9]+]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP47:%.*]] = load double, ptr [[TMP46]], align 8, !tbaa [[TBAA35:![0-9]+]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP48:%.*]] = fptrunc double [[TMP47]] to float
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP49:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index()
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP49]], i8 0
diff --git a/llvmraytracing/test/dx/payload-save-registers.ll b/llvmraytracing/test/dx/payload-save-registers.ll
index 1107306d2a..14aba15cc7 100644
--- a/llvmraytracing/test/dx/payload-save-registers.ll
+++ b/llvmraytracing/test/dx/payload-save-registers.ll
@@ -147,14 +147,14 @@ define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !poi
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP119:%.*]] = load i32, ptr [[TMP57]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP119]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP79:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP122:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [4 x i32] poison, [1 x i32] [[TMP79]]), !continuation.registercount [[META32:![0-9]+]], !continuation.returnedRegistercount [[META32]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP125:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [1 x i32] } @await(ptr [[TMP122]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP125:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa4i32a1i32s(i64 4, i32 8, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [4 x i32] poison, [1 x i32] [[TMP79]]), !continuation.registercount [[META32:![0-9]+]], !continuation.returnedRegistercount [[META32]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP61:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [1 x i32] } [[TMP125]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [1 x i32] [[TMP61]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_INNERPAYLOAD]] poison, ptr [[TMP46]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP117:%.*]] = freeze [[STRUCT_INNERPAYLOAD]] poison
+; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_INNERPAYLOAD]] [[TMP117]], ptr [[TMP46]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP128:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP128]], ptr [[TMP59]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP122:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP122]], ptr [[TMP59]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP58:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [1 x i32] } [[TMP125]], 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP58]], ptr [[TMP54]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
@@ -198,8 +198,8 @@ define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !poi
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP114]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP137:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 1
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP117:%.*]] = load i32, ptr [[TMP100]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP117]], ptr [[TMP137]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP128:%.*]] = load i32, ptr [[TMP100]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP128]], ptr [[TMP137]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP140:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP103:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP120:%.*]] = load i32, ptr [[TMP103]], align 4
@@ -255,7 +255,7 @@ define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !poi
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP95:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP94]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP143:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP95]], [4 x i32] poison, [30 x i32] [[TMP143]]), !continuation.registercount [[META24]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP95]], [4 x i32] poison, [30 x i32] [[TMP143]]), !continuation.registercount [[META24]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
   %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A", align 4
@@ -646,11 +646,11 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !pointeety
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP488:%.*]] = load i32, ptr [[TMP270]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP488]], ptr [[TMP269]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP272:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP489:%.*]] = call ptr inttoptr (i64 2 to ptr)(i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [4 x i32] poison, [30 x i32] [[TMP272]]), !continuation.registercount [[META24]], !continuation.returnedRegistercount [[META24]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP274:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [30 x i32] } @await.1(ptr [[TMP489]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP274:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa4i32a30i32s(i64 2, i32 4, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], [4 x i32] poison, [30 x i32] [[TMP272]]), !continuation.registercount [[META24]], !continuation.returnedRegistercount [[META24]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP490:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [4 x i32], [30 x i32] } [[TMP274]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [30 x i32] [[TMP490]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_OUTERPAYLOAD]] poison, ptr [[TMP2]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP358:%.*]] = freeze [[STRUCT_OUTERPAYLOAD]] poison
+; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_OUTERPAYLOAD]] [[TMP358]], ptr [[TMP2]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP277:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP277]], ptr [[TMP224]], align 4
@@ -760,8 +760,8 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !pointeety
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP355]], ptr [[TMP268]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP271:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 27
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP357:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 27
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP358:%.*]] = load i32, ptr [[TMP357]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP358]], ptr [[TMP271]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP489:%.*]] = load i32, ptr [[TMP357]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP489]], ptr [[TMP271]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP273:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 28
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP360:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 28
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP361:%.*]] = load i32, ptr [[TMP360]], align 4
@@ -956,7 +956,7 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !pointeety
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP484]], ptr [[TMP482]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP382:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP486:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP382]], [4 x i32] poison, [30 x i32] [[TMP486]]), !continuation.registercount [[META24]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP382]], [4 x i32] poison, [30 x i32] [[TMP486]]), !continuation.registercount [[META24]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
   %1 = alloca %struct.OuterPayload, align 8
@@ -1180,6 +1180,8 @@ declare !pointeetys !39 void @_AmdRestoreSystemData(%struct.DispatchSystemData*)
 ; Function Attrs: nounwind memory(none)
 declare !pointeetys !41 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #1
 
+declare !pointeetys !43 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 ; Function Attrs: alwaysinline
 define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #3 !pointeetys !43 {
 ; LOWERRAYTRACINGPIPELINE-LABEL: define i32 @_cont_GetLocalRootIndex(
diff --git a/llvmraytracing/test/dx/payload.ll b/llvmraytracing/test/dx/payload.ll
index cc424b6621..9a487a289c 100644
--- a/llvmraytracing/test/dx/payload.ll
+++ b/llvmraytracing/test/dx/payload.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
-; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | grep -v lgc.cps.module | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s
-; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | grep -v lgc.cps.module | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \
+; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | grep -v lgc.cps.module | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s
+; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | grep -v lgc.cps.module | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \
 ; RUN:     -S --lint-abort-on-error | FileCheck -check-prefix=POST-PROCESS %s
-; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \
+; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \
 ; RUN:     -S --lint-abort-on-error | FileCheck -check-prefix=POST-PROCESS-GLOBAL %s
 ; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error | FileCheck -check-prefix=CLEANUP-CPS %s
 ; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \
@@ -68,6 +68,7 @@ declare i64 @_AmdGetResumePointAddr() #3
 
 ; Function Attrs: nounwind memory(none)
 declare !pointeetys !26 void @_cont_AcceptHit(%struct.AnyHitTraversalData* nocapture readnone) #2
+declare !pointeetys !28 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 
 ; Function Attrs: alwaysinline
 define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #0 !pointeetys !28 {
@@ -219,7 +220,7 @@ attributes #3 = { nounwind }
 ; CLEANUP-NEXT:    [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT56]], 0
 ; CLEANUP-NEXT:    [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0
 ; CLEANUP-NEXT:    [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0
-; CLEANUP-NEXT:    [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR3:[0-9]+]]
+; CLEANUP-NEXT:    [[ADDR_I:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @main.resume.0)
 ; CLEANUP-NEXT:    [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I]], 5
 ; CLEANUP-NEXT:    [[TMP6:%.*]] = ptrtoint ptr addrspace(32) [[PAYLOAD_SPILL_ALLOCA]] to i32
 ; CLEANUP-NEXT:    [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr addrspace(32)
@@ -306,8 +307,7 @@ attributes #3 = { nounwind }
 ; CLEANUP-NEXT:    [[DOTFCA_27_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT]], i32 undef, 27
 ; CLEANUP-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 undef, 28
 ; CLEANUP-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 undef, 29
-; CLEANUP-NEXT:    [[TMP34:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @main.resume.0)
-; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP34]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17:![0-9]+]], !waitmask [[META22:![0-9]+]], !continuation.returnedRegistercount [[META17]]
+; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17:![0-9]+]], !waitmask [[META19]], !continuation.returnedRegistercount [[META17]]
 ; CLEANUP-NEXT:    unreachable
 ;
 ;
@@ -347,72 +347,121 @@ attributes #3 = { nounwind }
 ; CLEANUP-NEXT:    [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP3]], 27
 ; CLEANUP-NEXT:    [[TMP23:%.*]] = extractvalue [30 x i32] [[TMP3]], 28
 ; CLEANUP-NEXT:    [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP3]], 29
+; CLEANUP-NEXT:    [[TMP4:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; CLEANUP-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 0
+; CLEANUP-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 1
+; CLEANUP-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 2
+; CLEANUP-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 3
+; CLEANUP-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 4
+; CLEANUP-NEXT:    [[DOTFCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 5
+; CLEANUP-NEXT:    [[DOTFCA_0_6_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 6
+; CLEANUP-NEXT:    [[DOTFCA_0_7_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 7
+; CLEANUP-NEXT:    [[DOTFCA_0_8_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 8
+; CLEANUP-NEXT:    [[DOTFCA_0_9_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 9
+; CLEANUP-NEXT:    [[DOTFCA_0_10_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 10
+; CLEANUP-NEXT:    [[DOTFCA_0_11_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 11
+; CLEANUP-NEXT:    [[DOTFCA_0_12_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 12
+; CLEANUP-NEXT:    [[DOTFCA_0_13_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 13
+; CLEANUP-NEXT:    [[DOTFCA_0_14_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 14
+; CLEANUP-NEXT:    [[DOTFCA_0_15_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 15
+; CLEANUP-NEXT:    [[DOTFCA_0_16_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 16
+; CLEANUP-NEXT:    [[DOTFCA_0_17_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 17
+; CLEANUP-NEXT:    [[DOTFCA_0_18_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 18
+; CLEANUP-NEXT:    [[DOTFCA_0_19_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 19
+; CLEANUP-NEXT:    [[DOTFCA_0_20_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 20
+; CLEANUP-NEXT:    [[DOTFCA_0_21_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 21
+; CLEANUP-NEXT:    [[DOTFCA_0_22_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 22
+; CLEANUP-NEXT:    [[DOTFCA_0_23_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 23
+; CLEANUP-NEXT:    [[DOTFCA_0_24_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 24
+; CLEANUP-NEXT:    [[DOTFCA_0_25_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 25
+; CLEANUP-NEXT:    [[DOTFCA_0_26_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 26
+; CLEANUP-NEXT:    [[DOTFCA_0_27_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 27
+; CLEANUP-NEXT:    [[DOTFCA_0_28_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 28
+; CLEANUP-NEXT:    [[DOTFCA_0_29_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 29
+; CLEANUP-NEXT:    [[DOTFCA_0_30_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 30
+; CLEANUP-NEXT:    [[DOTFCA_0_31_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 31
+; CLEANUP-NEXT:    [[DOTFCA_0_32_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 32
+; CLEANUP-NEXT:    [[DOTFCA_0_33_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 33
+; CLEANUP-NEXT:    [[DOTFCA_0_34_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 34
+; CLEANUP-NEXT:    [[DOTFCA_0_35_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 35
+; CLEANUP-NEXT:    [[DOTFCA_0_36_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 36
+; CLEANUP-NEXT:    [[DOTFCA_0_37_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 37
+; CLEANUP-NEXT:    [[DOTFCA_0_38_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 38
+; CLEANUP-NEXT:    [[DOTFCA_0_39_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 39
+; CLEANUP-NEXT:    [[DOTFCA_0_40_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 40
+; CLEANUP-NEXT:    [[DOTFCA_0_41_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 41
+; CLEANUP-NEXT:    [[DOTFCA_0_42_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 42
+; CLEANUP-NEXT:    [[DOTFCA_0_43_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 43
+; CLEANUP-NEXT:    [[DOTFCA_0_44_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 44
+; CLEANUP-NEXT:    [[DOTFCA_0_45_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 45
+; CLEANUP-NEXT:    [[DOTFCA_0_46_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 46
+; CLEANUP-NEXT:    [[DOTFCA_0_47_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 47
+; CLEANUP-NEXT:    [[DOTFCA_0_48_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 48
+; CLEANUP-NEXT:    [[DOTFCA_0_49_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 49
 ; CLEANUP-NEXT:    [[TMP2:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(32)
-; CLEANUP-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(32) [[TMP2]], align 4
+; CLEANUP-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(32) [[TMP2]], align 4
 ; CLEANUP-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 1
-; CLEANUP-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(32) [[TMP27]], align 4
+; CLEANUP-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(32) [[TMP27]], align 4
 ; CLEANUP-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 2
-; CLEANUP-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(32) [[TMP29]], align 4
+; CLEANUP-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(32) [[TMP29]], align 4
 ; CLEANUP-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 3
-; CLEANUP-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(32) [[TMP31]], align 4
+; CLEANUP-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(32) [[TMP31]], align 4
 ; CLEANUP-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 4
-; CLEANUP-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(32) [[TMP33]], align 4
+; CLEANUP-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(32) [[TMP33]], align 4
 ; CLEANUP-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 5
-; CLEANUP-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(32) [[TMP35]], align 4
+; CLEANUP-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(32) [[TMP35]], align 4
 ; CLEANUP-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 6
-; CLEANUP-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(32) [[TMP37]], align 4
+; CLEANUP-NEXT:    [[TMP18:%.*]] = load i32, ptr addrspace(32) [[TMP37]], align 4
 ; CLEANUP-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 7
-; CLEANUP-NEXT:    [[TMP18:%.*]] = load i32, ptr addrspace(32) [[TMP39]], align 4
+; CLEANUP-NEXT:    [[TMP20:%.*]] = load i32, ptr addrspace(32) [[TMP39]], align 4
 ; CLEANUP-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 8
-; CLEANUP-NEXT:    [[TMP20:%.*]] = load i32, ptr addrspace(32) [[TMP41]], align 4
+; CLEANUP-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(32) [[TMP41]], align 4
 ; CLEANUP-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 9
-; CLEANUP-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(32) [[TMP43]], align 4
+; CLEANUP-NEXT:    [[TMP24:%.*]] = load i32, ptr addrspace(32) [[TMP43]], align 4
 ; CLEANUP-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 10
-; CLEANUP-NEXT:    [[TMP24:%.*]] = load i32, ptr addrspace(32) [[TMP45]], align 4
+; CLEANUP-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(32) [[TMP45]], align 4
 ; CLEANUP-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 11
-; CLEANUP-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(32) [[TMP47]], align 4
+; CLEANUP-NEXT:    [[TMP28:%.*]] = load i32, ptr addrspace(32) [[TMP47]], align 4
 ; CLEANUP-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 12
-; CLEANUP-NEXT:    [[TMP28:%.*]] = load i32, ptr addrspace(32) [[TMP49]], align 4
+; CLEANUP-NEXT:    [[TMP30:%.*]] = load i32, ptr addrspace(32) [[TMP49]], align 4
 ; CLEANUP-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 13
-; CLEANUP-NEXT:    [[TMP30:%.*]] = load i32, ptr addrspace(32) [[TMP51]], align 4
+; CLEANUP-NEXT:    [[TMP32:%.*]] = load i32, ptr addrspace(32) [[TMP51]], align 4
 ; CLEANUP-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 14
-; CLEANUP-NEXT:    [[TMP32:%.*]] = load i32, ptr addrspace(32) [[TMP53]], align 4
+; CLEANUP-NEXT:    [[TMP34:%.*]] = load i32, ptr addrspace(32) [[TMP53]], align 4
 ; CLEANUP-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 15
-; CLEANUP-NEXT:    [[TMP34:%.*]] = load i32, ptr addrspace(32) [[TMP55]], align 4
+; CLEANUP-NEXT:    [[TMP36:%.*]] = load i32, ptr addrspace(32) [[TMP55]], align 4
 ; CLEANUP-NEXT:    [[TMP57:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 16
-; CLEANUP-NEXT:    [[TMP36:%.*]] = load i32, ptr addrspace(32) [[TMP57]], align 4
+; CLEANUP-NEXT:    [[TMP38:%.*]] = load i32, ptr addrspace(32) [[TMP57]], align 4
 ; CLEANUP-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 17
-; CLEANUP-NEXT:    [[TMP38:%.*]] = load i32, ptr addrspace(32) [[TMP59]], align 4
+; CLEANUP-NEXT:    [[TMP40:%.*]] = load i32, ptr addrspace(32) [[TMP59]], align 4
 ; CLEANUP-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 18
-; CLEANUP-NEXT:    [[TMP40:%.*]] = load i32, ptr addrspace(32) [[TMP61]], align 4
+; CLEANUP-NEXT:    [[TMP42:%.*]] = load i32, ptr addrspace(32) [[TMP61]], align 4
 ; CLEANUP-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 19
-; CLEANUP-NEXT:    [[TMP42:%.*]] = load i32, ptr addrspace(32) [[TMP63]], align 4
+; CLEANUP-NEXT:    [[TMP44:%.*]] = load i32, ptr addrspace(32) [[TMP63]], align 4
 ; CLEANUP-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 20
-; CLEANUP-NEXT:    [[TMP44:%.*]] = load i32, ptr addrspace(32) [[TMP65]], align 4
+; CLEANUP-NEXT:    [[TMP46:%.*]] = load i32, ptr addrspace(32) [[TMP65]], align 4
 ; CLEANUP-NEXT:    [[TMP67:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 21
-; CLEANUP-NEXT:    [[TMP46:%.*]] = load i32, ptr addrspace(32) [[TMP67]], align 4
+; CLEANUP-NEXT:    [[TMP48:%.*]] = load i32, ptr addrspace(32) [[TMP67]], align 4
 ; CLEANUP-NEXT:    [[TMP69:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 22
-; CLEANUP-NEXT:    [[TMP48:%.*]] = load i32, ptr addrspace(32) [[TMP69]], align 4
+; CLEANUP-NEXT:    [[TMP50:%.*]] = load i32, ptr addrspace(32) [[TMP69]], align 4
 ; CLEANUP-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 23
-; CLEANUP-NEXT:    [[TMP50:%.*]] = load i32, ptr addrspace(32) [[TMP71]], align 4
+; CLEANUP-NEXT:    [[TMP52:%.*]] = load i32, ptr addrspace(32) [[TMP71]], align 4
 ; CLEANUP-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 24
-; CLEANUP-NEXT:    [[TMP52:%.*]] = load i32, ptr addrspace(32) [[TMP73]], align 4
+; CLEANUP-NEXT:    [[TMP54:%.*]] = load i32, ptr addrspace(32) [[TMP73]], align 4
 ; CLEANUP-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 25
-; CLEANUP-NEXT:    [[TMP54:%.*]] = load i32, ptr addrspace(32) [[TMP75]], align 4
+; CLEANUP-NEXT:    [[TMP56:%.*]] = load i32, ptr addrspace(32) [[TMP75]], align 4
 ; CLEANUP-NEXT:    [[TMP77:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 26
-; CLEANUP-NEXT:    [[TMP56:%.*]] = load i32, ptr addrspace(32) [[TMP77]], align 4
+; CLEANUP-NEXT:    [[TMP62:%.*]] = load i32, ptr addrspace(32) [[TMP77]], align 4
 ; CLEANUP-NEXT:    [[TMP60:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(32)
 ; CLEANUP-NEXT:    [[TMP58:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP1]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_0_EXTRACT57:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP58]], 0
 ; CLEANUP-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
 ; CLEANUP-NEXT:    call void @lgc.cps.free(i32 108)
 ; CLEANUP-NEXT:    ret void
-; CLEANUP:       entryresume.0.split:
-; CLEANUP-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-LABEL: define void @AnyHit(
-; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META23:![0-9]+]] !continuation.registercount [[META17]] !continuation [[META24:![0-9]+]] !continuation.state [[META8]] {
+; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.registercount [[META17]] !continuation [[META23:![0-9]+]] !continuation.state [[META8]] {
 ; CLEANUP-NEXT:  AllocaSpillBB:
 ; CLEANUP-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8
 ; CLEANUP-NEXT:    [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0
@@ -668,12 +717,12 @@ attributes #3 = { nounwind }
 ; CLEANUP-NEXT:    [[DOTFCA_27_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27
 ; CLEANUP-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
-; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17]]
+; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17]]
 ; CLEANUP-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-LABEL: define void @ClosestHit(
-; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [21 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META25:![0-9]+]] !continuation.registercount [[META17]] !continuation [[META26:![0-9]+]] !continuation.stacksize [[META27:![0-9]+]] !continuation.state [[META28:![0-9]+]] {
+; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [21 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META24:![0-9]+]] !continuation.registercount [[META17]] !continuation [[META25:![0-9]+]] !continuation.stacksize [[META26:![0-9]+]] !continuation.state [[META27:![0-9]+]] {
 ; CLEANUP-NEXT:  AllocaSpillBB:
 ; CLEANUP-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 120)
 ; CLEANUP-NEXT:    [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[CLOSESTHIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
@@ -782,7 +831,7 @@ attributes #3 = { nounwind }
 ; CLEANUP-NEXT:    [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0
 ; CLEANUP-NEXT:    [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0
 ; CLEANUP-NEXT:    [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0
-; CLEANUP-NEXT:    [[ADDR_I1:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR3]]
+; CLEANUP-NEXT:    [[ADDR_I1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @ClosestHit.resume.0)
 ; CLEANUP-NEXT:    [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I1]], 5
 ; CLEANUP-NEXT:    [[TMP88:%.*]] = ptrtoint ptr addrspace(32) [[PAYLOAD_SPILL_ALLOCA]] to i32
 ; CLEANUP-NEXT:    [[TMP89:%.*]] = inttoptr i32 [[TMP88]] to ptr addrspace(32)
@@ -869,13 +918,12 @@ attributes #3 = { nounwind }
 ; CLEANUP-NEXT:    [[DOTFCA_27_INSERT135:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT132]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27
 ; CLEANUP-NEXT:    [[DOTFCA_28_INSERT138:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT135]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-NEXT:    [[DOTFCA_29_INSERT141:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT138]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
-; CLEANUP-NEXT:    [[TMP116:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @ClosestHit.resume.0)
-; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP116]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT141]]), !continuation.registercount [[META17]], !waitmask [[META22]], !continuation.returnedRegistercount [[META17]]
+; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT141]]), !continuation.registercount [[META17]], !waitmask [[META19]], !continuation.returnedRegistercount [[META17]]
 ; CLEANUP-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-LABEL: define dso_local void @ClosestHit.resume.0(
-; CLEANUP-SAME: i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [23 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META25]] !continuation.registercount [[META17]] !continuation [[META26]] {
+; CLEANUP-SAME: i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [23 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META24]] !continuation.registercount [[META17]] !continuation [[META25]] {
 ; CLEANUP-NEXT:  entryresume.0:
 ; CLEANUP-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 120)
 ; CLEANUP-NEXT:    [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[CLOSESTHIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
@@ -910,60 +958,111 @@ attributes #3 = { nounwind }
 ; CLEANUP-NEXT:    [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP3]], 27
 ; CLEANUP-NEXT:    [[TMP23:%.*]] = extractvalue [30 x i32] [[TMP3]], 28
 ; CLEANUP-NEXT:    [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP3]], 29
+; CLEANUP-NEXT:    [[TMP4:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; CLEANUP-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 0
+; CLEANUP-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 1
+; CLEANUP-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 2
+; CLEANUP-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 3
+; CLEANUP-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 4
+; CLEANUP-NEXT:    [[DOTFCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 5
+; CLEANUP-NEXT:    [[DOTFCA_0_6_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 6
+; CLEANUP-NEXT:    [[DOTFCA_0_7_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 7
+; CLEANUP-NEXT:    [[DOTFCA_0_8_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 8
+; CLEANUP-NEXT:    [[DOTFCA_0_9_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 9
+; CLEANUP-NEXT:    [[DOTFCA_0_10_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 10
+; CLEANUP-NEXT:    [[DOTFCA_0_11_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 11
+; CLEANUP-NEXT:    [[DOTFCA_0_12_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 12
+; CLEANUP-NEXT:    [[DOTFCA_0_13_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 13
+; CLEANUP-NEXT:    [[DOTFCA_0_14_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 14
+; CLEANUP-NEXT:    [[DOTFCA_0_15_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 15
+; CLEANUP-NEXT:    [[DOTFCA_0_16_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 16
+; CLEANUP-NEXT:    [[DOTFCA_0_17_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 17
+; CLEANUP-NEXT:    [[DOTFCA_0_18_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 18
+; CLEANUP-NEXT:    [[DOTFCA_0_19_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 19
+; CLEANUP-NEXT:    [[DOTFCA_0_20_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 20
+; CLEANUP-NEXT:    [[DOTFCA_0_21_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 21
+; CLEANUP-NEXT:    [[DOTFCA_0_22_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 22
+; CLEANUP-NEXT:    [[DOTFCA_0_23_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 23
+; CLEANUP-NEXT:    [[DOTFCA_0_24_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 24
+; CLEANUP-NEXT:    [[DOTFCA_0_25_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 25
+; CLEANUP-NEXT:    [[DOTFCA_0_26_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 26
+; CLEANUP-NEXT:    [[DOTFCA_0_27_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 27
+; CLEANUP-NEXT:    [[DOTFCA_0_28_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 28
+; CLEANUP-NEXT:    [[DOTFCA_0_29_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 29
+; CLEANUP-NEXT:    [[DOTFCA_0_30_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 30
+; CLEANUP-NEXT:    [[DOTFCA_0_31_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 31
+; CLEANUP-NEXT:    [[DOTFCA_0_32_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 32
+; CLEANUP-NEXT:    [[DOTFCA_0_33_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 33
+; CLEANUP-NEXT:    [[DOTFCA_0_34_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 34
+; CLEANUP-NEXT:    [[DOTFCA_0_35_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 35
+; CLEANUP-NEXT:    [[DOTFCA_0_36_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 36
+; CLEANUP-NEXT:    [[DOTFCA_0_37_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 37
+; CLEANUP-NEXT:    [[DOTFCA_0_38_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 38
+; CLEANUP-NEXT:    [[DOTFCA_0_39_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 39
+; CLEANUP-NEXT:    [[DOTFCA_0_40_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 40
+; CLEANUP-NEXT:    [[DOTFCA_0_41_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 41
+; CLEANUP-NEXT:    [[DOTFCA_0_42_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 42
+; CLEANUP-NEXT:    [[DOTFCA_0_43_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 43
+; CLEANUP-NEXT:    [[DOTFCA_0_44_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 44
+; CLEANUP-NEXT:    [[DOTFCA_0_45_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 45
+; CLEANUP-NEXT:    [[DOTFCA_0_46_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 46
+; CLEANUP-NEXT:    [[DOTFCA_0_47_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 47
+; CLEANUP-NEXT:    [[DOTFCA_0_48_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 48
+; CLEANUP-NEXT:    [[DOTFCA_0_49_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP4]], 0, 49
 ; CLEANUP-NEXT:    [[TMP2:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT1]] to ptr addrspace(32)
-; CLEANUP-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(32) [[TMP2]], align 4
+; CLEANUP-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(32) [[TMP2]], align 4
 ; CLEANUP-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 1
-; CLEANUP-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(32) [[TMP27]], align 4
+; CLEANUP-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(32) [[TMP27]], align 4
 ; CLEANUP-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 2
-; CLEANUP-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(32) [[TMP29]], align 4
+; CLEANUP-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(32) [[TMP29]], align 4
 ; CLEANUP-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 3
-; CLEANUP-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(32) [[TMP31]], align 4
+; CLEANUP-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(32) [[TMP31]], align 4
 ; CLEANUP-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 4
-; CLEANUP-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(32) [[TMP33]], align 4
+; CLEANUP-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(32) [[TMP33]], align 4
 ; CLEANUP-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 5
-; CLEANUP-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(32) [[TMP35]], align 4
+; CLEANUP-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(32) [[TMP35]], align 4
 ; CLEANUP-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 6
-; CLEANUP-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(32) [[TMP37]], align 4
+; CLEANUP-NEXT:    [[TMP18:%.*]] = load i32, ptr addrspace(32) [[TMP37]], align 4
 ; CLEANUP-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 7
-; CLEANUP-NEXT:    [[TMP18:%.*]] = load i32, ptr addrspace(32) [[TMP39]], align 4
+; CLEANUP-NEXT:    [[TMP20:%.*]] = load i32, ptr addrspace(32) [[TMP39]], align 4
 ; CLEANUP-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 8
-; CLEANUP-NEXT:    [[TMP20:%.*]] = load i32, ptr addrspace(32) [[TMP41]], align 4
+; CLEANUP-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(32) [[TMP41]], align 4
 ; CLEANUP-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 9
-; CLEANUP-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(32) [[TMP43]], align 4
+; CLEANUP-NEXT:    [[TMP24:%.*]] = load i32, ptr addrspace(32) [[TMP43]], align 4
 ; CLEANUP-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 10
-; CLEANUP-NEXT:    [[TMP24:%.*]] = load i32, ptr addrspace(32) [[TMP45]], align 4
+; CLEANUP-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(32) [[TMP45]], align 4
 ; CLEANUP-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 11
-; CLEANUP-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(32) [[TMP47]], align 4
+; CLEANUP-NEXT:    [[TMP28:%.*]] = load i32, ptr addrspace(32) [[TMP47]], align 4
 ; CLEANUP-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 12
-; CLEANUP-NEXT:    [[TMP28:%.*]] = load i32, ptr addrspace(32) [[TMP49]], align 4
+; CLEANUP-NEXT:    [[TMP30:%.*]] = load i32, ptr addrspace(32) [[TMP49]], align 4
 ; CLEANUP-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 13
-; CLEANUP-NEXT:    [[TMP30:%.*]] = load i32, ptr addrspace(32) [[TMP51]], align 4
+; CLEANUP-NEXT:    [[TMP32:%.*]] = load i32, ptr addrspace(32) [[TMP51]], align 4
 ; CLEANUP-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 14
-; CLEANUP-NEXT:    [[TMP32:%.*]] = load i32, ptr addrspace(32) [[TMP53]], align 4
+; CLEANUP-NEXT:    [[TMP34:%.*]] = load i32, ptr addrspace(32) [[TMP53]], align 4
 ; CLEANUP-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 15
-; CLEANUP-NEXT:    [[TMP34:%.*]] = load i32, ptr addrspace(32) [[TMP55]], align 4
+; CLEANUP-NEXT:    [[TMP36:%.*]] = load i32, ptr addrspace(32) [[TMP55]], align 4
 ; CLEANUP-NEXT:    [[TMP57:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 16
-; CLEANUP-NEXT:    [[TMP36:%.*]] = load i32, ptr addrspace(32) [[TMP57]], align 4
+; CLEANUP-NEXT:    [[TMP38:%.*]] = load i32, ptr addrspace(32) [[TMP57]], align 4
 ; CLEANUP-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 17
-; CLEANUP-NEXT:    [[TMP38:%.*]] = load i32, ptr addrspace(32) [[TMP59]], align 4
+; CLEANUP-NEXT:    [[TMP40:%.*]] = load i32, ptr addrspace(32) [[TMP59]], align 4
 ; CLEANUP-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 18
-; CLEANUP-NEXT:    [[TMP40:%.*]] = load i32, ptr addrspace(32) [[TMP61]], align 4
+; CLEANUP-NEXT:    [[TMP42:%.*]] = load i32, ptr addrspace(32) [[TMP61]], align 4
 ; CLEANUP-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 19
-; CLEANUP-NEXT:    [[TMP42:%.*]] = load i32, ptr addrspace(32) [[TMP63]], align 4
+; CLEANUP-NEXT:    [[TMP44:%.*]] = load i32, ptr addrspace(32) [[TMP63]], align 4
 ; CLEANUP-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 20
-; CLEANUP-NEXT:    [[TMP44:%.*]] = load i32, ptr addrspace(32) [[TMP65]], align 4
+; CLEANUP-NEXT:    [[TMP46:%.*]] = load i32, ptr addrspace(32) [[TMP65]], align 4
 ; CLEANUP-NEXT:    [[TMP67:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 21
-; CLEANUP-NEXT:    [[TMP46:%.*]] = load i32, ptr addrspace(32) [[TMP67]], align 4
+; CLEANUP-NEXT:    [[TMP48:%.*]] = load i32, ptr addrspace(32) [[TMP67]], align 4
 ; CLEANUP-NEXT:    [[TMP69:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 22
-; CLEANUP-NEXT:    [[TMP48:%.*]] = load i32, ptr addrspace(32) [[TMP69]], align 4
+; CLEANUP-NEXT:    [[TMP50:%.*]] = load i32, ptr addrspace(32) [[TMP69]], align 4
 ; CLEANUP-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 23
-; CLEANUP-NEXT:    [[TMP50:%.*]] = load i32, ptr addrspace(32) [[TMP71]], align 4
+; CLEANUP-NEXT:    [[TMP52:%.*]] = load i32, ptr addrspace(32) [[TMP71]], align 4
 ; CLEANUP-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 24
-; CLEANUP-NEXT:    [[TMP52:%.*]] = load i32, ptr addrspace(32) [[TMP73]], align 4
+; CLEANUP-NEXT:    [[TMP54:%.*]] = load i32, ptr addrspace(32) [[TMP73]], align 4
 ; CLEANUP-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 25
-; CLEANUP-NEXT:    [[TMP54:%.*]] = load i32, ptr addrspace(32) [[TMP75]], align 4
+; CLEANUP-NEXT:    [[TMP56:%.*]] = load i32, ptr addrspace(32) [[TMP75]], align 4
 ; CLEANUP-NEXT:    [[TMP77:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 26
-; CLEANUP-NEXT:    [[TMP56:%.*]] = load i32, ptr addrspace(32) [[TMP77]], align 4
+; CLEANUP-NEXT:    [[TMP62:%.*]] = load i32, ptr addrspace(32) [[TMP77]], align 4
 ; CLEANUP-NEXT:    [[TMP60:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT1]] to ptr addrspace(32)
 ; CLEANUP-NEXT:    [[TMP58:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP1]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP58]], 0
@@ -973,59 +1072,59 @@ attributes #3 = { nounwind }
 ; CLEANUP-NEXT:    [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[CLOSESTHIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1
 ; CLEANUP-NEXT:    [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(32) [[RETURNADDR_RELOAD_ADDR]], align 4
 ; CLEANUP-NEXT:    [[TMP81:%.*]] = inttoptr i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]] to ptr addrspace(32)
-; CLEANUP-NEXT:    store i32 [[TMP4]], ptr addrspace(32) [[TMP81]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP6]], ptr addrspace(32) [[TMP81]], align 4
 ; CLEANUP-NEXT:    [[TMP106:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 1
-; CLEANUP-NEXT:    store i32 [[TMP6]], ptr addrspace(32) [[TMP106]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP8]], ptr addrspace(32) [[TMP106]], align 4
 ; CLEANUP-NEXT:    [[TMP107:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 2
-; CLEANUP-NEXT:    store i32 [[TMP8]], ptr addrspace(32) [[TMP107]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP10]], ptr addrspace(32) [[TMP107]], align 4
 ; CLEANUP-NEXT:    [[TMP82:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 3
-; CLEANUP-NEXT:    store i32 [[TMP10]], ptr addrspace(32) [[TMP82]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP12]], ptr addrspace(32) [[TMP82]], align 4
 ; CLEANUP-NEXT:    [[TMP83:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 4
-; CLEANUP-NEXT:    store i32 [[TMP12]], ptr addrspace(32) [[TMP83]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP14]], ptr addrspace(32) [[TMP83]], align 4
 ; CLEANUP-NEXT:    [[TMP84:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 5
-; CLEANUP-NEXT:    store i32 [[TMP14]], ptr addrspace(32) [[TMP84]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP16]], ptr addrspace(32) [[TMP84]], align 4
 ; CLEANUP-NEXT:    [[TMP85:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 6
-; CLEANUP-NEXT:    store i32 [[TMP16]], ptr addrspace(32) [[TMP85]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP18]], ptr addrspace(32) [[TMP85]], align 4
 ; CLEANUP-NEXT:    [[TMP86:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 7
-; CLEANUP-NEXT:    store i32 [[TMP18]], ptr addrspace(32) [[TMP86]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP20]], ptr addrspace(32) [[TMP86]], align 4
 ; CLEANUP-NEXT:    [[TMP87:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 8
-; CLEANUP-NEXT:    store i32 [[TMP20]], ptr addrspace(32) [[TMP87]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP22]], ptr addrspace(32) [[TMP87]], align 4
 ; CLEANUP-NEXT:    [[TMP88:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 9
-; CLEANUP-NEXT:    store i32 [[TMP22]], ptr addrspace(32) [[TMP88]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP24]], ptr addrspace(32) [[TMP88]], align 4
 ; CLEANUP-NEXT:    [[TMP89:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 10
-; CLEANUP-NEXT:    store i32 [[TMP24]], ptr addrspace(32) [[TMP89]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP26]], ptr addrspace(32) [[TMP89]], align 4
 ; CLEANUP-NEXT:    [[TMP90:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 11
-; CLEANUP-NEXT:    store i32 [[TMP26]], ptr addrspace(32) [[TMP90]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP28]], ptr addrspace(32) [[TMP90]], align 4
 ; CLEANUP-NEXT:    [[TMP91:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 12
-; CLEANUP-NEXT:    store i32 [[TMP28]], ptr addrspace(32) [[TMP91]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP30]], ptr addrspace(32) [[TMP91]], align 4
 ; CLEANUP-NEXT:    [[TMP92:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 13
-; CLEANUP-NEXT:    store i32 [[TMP30]], ptr addrspace(32) [[TMP92]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP32]], ptr addrspace(32) [[TMP92]], align 4
 ; CLEANUP-NEXT:    [[TMP93:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 14
-; CLEANUP-NEXT:    store i32 [[TMP32]], ptr addrspace(32) [[TMP93]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP34]], ptr addrspace(32) [[TMP93]], align 4
 ; CLEANUP-NEXT:    [[TMP94:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 15
-; CLEANUP-NEXT:    store i32 [[TMP34]], ptr addrspace(32) [[TMP94]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP36]], ptr addrspace(32) [[TMP94]], align 4
 ; CLEANUP-NEXT:    [[TMP95:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 16
-; CLEANUP-NEXT:    store i32 [[TMP36]], ptr addrspace(32) [[TMP95]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP38]], ptr addrspace(32) [[TMP95]], align 4
 ; CLEANUP-NEXT:    [[TMP96:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 17
-; CLEANUP-NEXT:    store i32 [[TMP38]], ptr addrspace(32) [[TMP96]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP40]], ptr addrspace(32) [[TMP96]], align 4
 ; CLEANUP-NEXT:    [[TMP97:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 18
-; CLEANUP-NEXT:    store i32 [[TMP40]], ptr addrspace(32) [[TMP97]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP42]], ptr addrspace(32) [[TMP97]], align 4
 ; CLEANUP-NEXT:    [[TMP98:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 19
-; CLEANUP-NEXT:    store i32 [[TMP42]], ptr addrspace(32) [[TMP98]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP44]], ptr addrspace(32) [[TMP98]], align 4
 ; CLEANUP-NEXT:    [[TMP99:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 20
-; CLEANUP-NEXT:    store i32 [[TMP44]], ptr addrspace(32) [[TMP99]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP46]], ptr addrspace(32) [[TMP99]], align 4
 ; CLEANUP-NEXT:    [[TMP100:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 21
-; CLEANUP-NEXT:    store i32 [[TMP46]], ptr addrspace(32) [[TMP100]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP48]], ptr addrspace(32) [[TMP100]], align 4
 ; CLEANUP-NEXT:    [[TMP101:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 22
-; CLEANUP-NEXT:    store i32 [[TMP48]], ptr addrspace(32) [[TMP101]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP50]], ptr addrspace(32) [[TMP101]], align 4
 ; CLEANUP-NEXT:    [[TMP102:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 23
-; CLEANUP-NEXT:    store i32 [[TMP50]], ptr addrspace(32) [[TMP102]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP52]], ptr addrspace(32) [[TMP102]], align 4
 ; CLEANUP-NEXT:    [[TMP103:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 24
-; CLEANUP-NEXT:    store i32 [[TMP52]], ptr addrspace(32) [[TMP103]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP54]], ptr addrspace(32) [[TMP103]], align 4
 ; CLEANUP-NEXT:    [[TMP104:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 25
-; CLEANUP-NEXT:    store i32 [[TMP54]], ptr addrspace(32) [[TMP104]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP56]], ptr addrspace(32) [[TMP104]], align 4
 ; CLEANUP-NEXT:    [[TMP105:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 26
-; CLEANUP-NEXT:    store i32 [[TMP56]], ptr addrspace(32) [[TMP105]], align 4
+; CLEANUP-NEXT:    store i32 [[TMP62]], ptr addrspace(32) [[TMP105]], align 4
 ; CLEANUP-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_0_INSERT1:%.*]] = insertvalue [30 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 0
 ; CLEANUP-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT1]], i32 [[DOTFCA_1_EXTRACT]], 1
@@ -1058,7 +1157,7 @@ attributes #3 = { nounwind }
 ; CLEANUP-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[TMP23]], 28
 ; CLEANUP-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29
 ; CLEANUP-NEXT:    call void @lgc.cps.free(i32 120)
-; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [23 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17]]
+; CLEANUP-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [23 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17]]
 ; CLEANUP-NEXT:    unreachable
 ;
 ;
@@ -1274,113 +1373,164 @@ attributes #3 = { nounwind }
 ; POST-PROCESS-NEXT:    [[TMP24:%.*]] = extractvalue [30 x i32] [[TMP4]], 27
 ; POST-PROCESS-NEXT:    [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP4]], 28
 ; POST-PROCESS-NEXT:    [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP4]], 29
-; POST-PROCESS-NEXT:    [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP5]], i32 0
-; POST-PROCESS-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(21) [[TMP6]], align 4
+; POST-PROCESS-NEXT:    [[TMP5:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; POST-PROCESS-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 0
+; POST-PROCESS-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 1
+; POST-PROCESS-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 2
+; POST-PROCESS-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 3
+; POST-PROCESS-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 4
+; POST-PROCESS-NEXT:    [[DOTFCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 5
+; POST-PROCESS-NEXT:    [[DOTFCA_0_6_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 6
+; POST-PROCESS-NEXT:    [[DOTFCA_0_7_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 7
+; POST-PROCESS-NEXT:    [[DOTFCA_0_8_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 8
+; POST-PROCESS-NEXT:    [[DOTFCA_0_9_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 9
+; POST-PROCESS-NEXT:    [[DOTFCA_0_10_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 10
+; POST-PROCESS-NEXT:    [[DOTFCA_0_11_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 11
+; POST-PROCESS-NEXT:    [[DOTFCA_0_12_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 12
+; POST-PROCESS-NEXT:    [[DOTFCA_0_13_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 13
+; POST-PROCESS-NEXT:    [[DOTFCA_0_14_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 14
+; POST-PROCESS-NEXT:    [[DOTFCA_0_15_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 15
+; POST-PROCESS-NEXT:    [[DOTFCA_0_16_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 16
+; POST-PROCESS-NEXT:    [[DOTFCA_0_17_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 17
+; POST-PROCESS-NEXT:    [[DOTFCA_0_18_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 18
+; POST-PROCESS-NEXT:    [[DOTFCA_0_19_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 19
+; POST-PROCESS-NEXT:    [[DOTFCA_0_20_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 20
+; POST-PROCESS-NEXT:    [[DOTFCA_0_21_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 21
+; POST-PROCESS-NEXT:    [[DOTFCA_0_22_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 22
+; POST-PROCESS-NEXT:    [[DOTFCA_0_23_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 23
+; POST-PROCESS-NEXT:    [[DOTFCA_0_24_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 24
+; POST-PROCESS-NEXT:    [[DOTFCA_0_25_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 25
+; POST-PROCESS-NEXT:    [[DOTFCA_0_26_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 26
+; POST-PROCESS-NEXT:    [[DOTFCA_0_27_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 27
+; POST-PROCESS-NEXT:    [[DOTFCA_0_28_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 28
+; POST-PROCESS-NEXT:    [[DOTFCA_0_29_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 29
+; POST-PROCESS-NEXT:    [[DOTFCA_0_30_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 30
+; POST-PROCESS-NEXT:    [[DOTFCA_0_31_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 31
+; POST-PROCESS-NEXT:    [[DOTFCA_0_32_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 32
+; POST-PROCESS-NEXT:    [[DOTFCA_0_33_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 33
+; POST-PROCESS-NEXT:    [[DOTFCA_0_34_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 34
+; POST-PROCESS-NEXT:    [[DOTFCA_0_35_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 35
+; POST-PROCESS-NEXT:    [[DOTFCA_0_36_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 36
+; POST-PROCESS-NEXT:    [[DOTFCA_0_37_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 37
+; POST-PROCESS-NEXT:    [[DOTFCA_0_38_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 38
+; POST-PROCESS-NEXT:    [[DOTFCA_0_39_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 39
+; POST-PROCESS-NEXT:    [[DOTFCA_0_40_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 40
+; POST-PROCESS-NEXT:    [[DOTFCA_0_41_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 41
+; POST-PROCESS-NEXT:    [[DOTFCA_0_42_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 42
+; POST-PROCESS-NEXT:    [[DOTFCA_0_43_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 43
+; POST-PROCESS-NEXT:    [[DOTFCA_0_44_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 44
+; POST-PROCESS-NEXT:    [[DOTFCA_0_45_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 45
+; POST-PROCESS-NEXT:    [[DOTFCA_0_46_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 46
+; POST-PROCESS-NEXT:    [[DOTFCA_0_47_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 47
+; POST-PROCESS-NEXT:    [[DOTFCA_0_48_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 48
+; POST-PROCESS-NEXT:    [[DOTFCA_0_49_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 49
+; POST-PROCESS-NEXT:    [[TMP6:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP6]], i32 0
+; POST-PROCESS-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(21) [[TMP7]], align 4
 ; POST-PROCESS-NEXT:    [[TMP30:%.*]] = add i32 [[TMP3]], 4
-; POST-PROCESS-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP30]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP9]], i32 0
-; POST-PROCESS-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(21) [[TMP10]], align 4
+; POST-PROCESS-NEXT:    [[TMP10:%.*]] = inttoptr i32 [[TMP30]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP10]], i32 0
+; POST-PROCESS-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(21) [[TMP11]], align 4
 ; POST-PROCESS-NEXT:    [[TMP34:%.*]] = add i32 [[TMP3]], 8
-; POST-PROCESS-NEXT:    [[TMP13:%.*]] = inttoptr i32 [[TMP34]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP13]], i32 0
-; POST-PROCESS-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(21) [[TMP14]], align 4
+; POST-PROCESS-NEXT:    [[TMP14:%.*]] = inttoptr i32 [[TMP34]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP14]], i32 0
+; POST-PROCESS-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(21) [[TMP15]], align 4
 ; POST-PROCESS-NEXT:    [[TMP38:%.*]] = add i32 [[TMP3]], 12
-; POST-PROCESS-NEXT:    [[TMP17:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP17]], i32 0
-; POST-PROCESS-NEXT:    [[TMP19:%.*]] = load i32, ptr addrspace(21) [[TMP18]], align 4
+; POST-PROCESS-NEXT:    [[TMP18:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP18]], i32 0
+; POST-PROCESS-NEXT:    [[TMP20:%.*]] = load i32, ptr addrspace(21) [[TMP19]], align 4
 ; POST-PROCESS-NEXT:    [[TMP42:%.*]] = add i32 [[TMP3]], 16
-; POST-PROCESS-NEXT:    [[TMP21:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP21]], i32 0
-; POST-PROCESS-NEXT:    [[TMP32:%.*]] = load i32, ptr addrspace(21) [[TMP28]], align 4
+; POST-PROCESS-NEXT:    [[TMP25:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP25]], i32 0
+; POST-PROCESS-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(21) [[TMP29]], align 4
 ; POST-PROCESS-NEXT:    [[TMP46:%.*]] = add i32 [[TMP3]], 20
-; POST-PROCESS-NEXT:    [[TMP25:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP25]], i32 0
-; POST-PROCESS-NEXT:    [[TMP36:%.*]] = load i32, ptr addrspace(21) [[TMP26]], align 4
+; POST-PROCESS-NEXT:    [[TMP26:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP26]], i32 0
+; POST-PROCESS-NEXT:    [[TMP28:%.*]] = load i32, ptr addrspace(21) [[TMP37]], align 4
 ; POST-PROCESS-NEXT:    [[TMP50:%.*]] = add i32 [[TMP3]], 24
-; POST-PROCESS-NEXT:    [[TMP29:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP29]], i32 0
-; POST-PROCESS-NEXT:    [[TMP31:%.*]] = load i32, ptr addrspace(21) [[TMP40]], align 4
+; POST-PROCESS-NEXT:    [[TMP41:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP31:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP41]], i32 0
+; POST-PROCESS-NEXT:    [[TMP32:%.*]] = load i32, ptr addrspace(21) [[TMP31]], align 4
 ; POST-PROCESS-NEXT:    [[TMP54:%.*]] = add i32 [[TMP3]], 28
-; POST-PROCESS-NEXT:    [[TMP33:%.*]] = inttoptr i32 [[TMP54]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP33]], i32 0
-; POST-PROCESS-NEXT:    [[TMP35:%.*]] = load i32, ptr addrspace(21) [[TMP44]], align 4
+; POST-PROCESS-NEXT:    [[TMP45:%.*]] = inttoptr i32 [[TMP54]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP45]], i32 0
+; POST-PROCESS-NEXT:    [[TMP36:%.*]] = load i32, ptr addrspace(21) [[TMP35]], align 4
 ; POST-PROCESS-NEXT:    [[TMP58:%.*]] = add i32 [[TMP3]], 32
-; POST-PROCESS-NEXT:    [[TMP37:%.*]] = inttoptr i32 [[TMP58]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP37]], i32 0
-; POST-PROCESS-NEXT:    [[TMP39:%.*]] = load i32, ptr addrspace(21) [[TMP48]], align 4
+; POST-PROCESS-NEXT:    [[TMP49:%.*]] = inttoptr i32 [[TMP58]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP49]], i32 0
+; POST-PROCESS-NEXT:    [[TMP40:%.*]] = load i32, ptr addrspace(21) [[TMP39]], align 4
 ; POST-PROCESS-NEXT:    [[TMP62:%.*]] = add i32 [[TMP3]], 36
-; POST-PROCESS-NEXT:    [[TMP41:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP52:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP41]], i32 0
-; POST-PROCESS-NEXT:    [[TMP43:%.*]] = load i32, ptr addrspace(21) [[TMP52]], align 4
+; POST-PROCESS-NEXT:    [[TMP53:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP53]], i32 0
+; POST-PROCESS-NEXT:    [[TMP44:%.*]] = load i32, ptr addrspace(21) [[TMP43]], align 4
 ; POST-PROCESS-NEXT:    [[TMP66:%.*]] = add i32 [[TMP3]], 40
-; POST-PROCESS-NEXT:    [[TMP45:%.*]] = inttoptr i32 [[TMP66]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP56:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP45]], i32 0
-; POST-PROCESS-NEXT:    [[TMP47:%.*]] = load i32, ptr addrspace(21) [[TMP56]], align 4
+; POST-PROCESS-NEXT:    [[TMP57:%.*]] = inttoptr i32 [[TMP66]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP47:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP57]], i32 0
+; POST-PROCESS-NEXT:    [[TMP48:%.*]] = load i32, ptr addrspace(21) [[TMP47]], align 4
 ; POST-PROCESS-NEXT:    [[TMP70:%.*]] = add i32 [[TMP3]], 44
-; POST-PROCESS-NEXT:    [[TMP49:%.*]] = inttoptr i32 [[TMP70]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP49]], i32 0
-; POST-PROCESS-NEXT:    [[TMP51:%.*]] = load i32, ptr addrspace(21) [[TMP60]], align 4
+; POST-PROCESS-NEXT:    [[TMP61:%.*]] = inttoptr i32 [[TMP70]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP61]], i32 0
+; POST-PROCESS-NEXT:    [[TMP52:%.*]] = load i32, ptr addrspace(21) [[TMP51]], align 4
 ; POST-PROCESS-NEXT:    [[TMP74:%.*]] = add i32 [[TMP3]], 48
-; POST-PROCESS-NEXT:    [[TMP53:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP53]], i32 0
-; POST-PROCESS-NEXT:    [[TMP55:%.*]] = load i32, ptr addrspace(21) [[TMP64]], align 4
+; POST-PROCESS-NEXT:    [[TMP65:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP55:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP65]], i32 0
+; POST-PROCESS-NEXT:    [[TMP56:%.*]] = load i32, ptr addrspace(21) [[TMP55]], align 4
 ; POST-PROCESS-NEXT:    [[TMP78:%.*]] = add i32 [[TMP3]], 52
-; POST-PROCESS-NEXT:    [[TMP57:%.*]] = inttoptr i32 [[TMP78]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP57]], i32 0
-; POST-PROCESS-NEXT:    [[TMP59:%.*]] = load i32, ptr addrspace(21) [[TMP68]], align 4
+; POST-PROCESS-NEXT:    [[TMP69:%.*]] = inttoptr i32 [[TMP78]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP59:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP69]], i32 0
+; POST-PROCESS-NEXT:    [[TMP60:%.*]] = load i32, ptr addrspace(21) [[TMP59]], align 4
 ; POST-PROCESS-NEXT:    [[TMP82:%.*]] = add i32 [[TMP3]], 56
-; POST-PROCESS-NEXT:    [[TMP61:%.*]] = inttoptr i32 [[TMP82]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP72:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP61]], i32 0
-; POST-PROCESS-NEXT:    [[TMP63:%.*]] = load i32, ptr addrspace(21) [[TMP72]], align 4
+; POST-PROCESS-NEXT:    [[TMP73:%.*]] = inttoptr i32 [[TMP82]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP73]], i32 0
+; POST-PROCESS-NEXT:    [[TMP64:%.*]] = load i32, ptr addrspace(21) [[TMP63]], align 4
 ; POST-PROCESS-NEXT:    [[TMP86:%.*]] = add i32 [[TMP3]], 60
-; POST-PROCESS-NEXT:    [[TMP65:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP76:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP65]], i32 0
-; POST-PROCESS-NEXT:    [[TMP67:%.*]] = load i32, ptr addrspace(21) [[TMP76]], align 4
+; POST-PROCESS-NEXT:    [[TMP77:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP67:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP77]], i32 0
+; POST-PROCESS-NEXT:    [[TMP68:%.*]] = load i32, ptr addrspace(21) [[TMP67]], align 4
 ; POST-PROCESS-NEXT:    [[TMP90:%.*]] = add i32 [[TMP3]], 64
-; POST-PROCESS-NEXT:    [[TMP69:%.*]] = inttoptr i32 [[TMP90]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP80:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP69]], i32 0
-; POST-PROCESS-NEXT:    [[TMP71:%.*]] = load i32, ptr addrspace(21) [[TMP80]], align 4
+; POST-PROCESS-NEXT:    [[TMP81:%.*]] = inttoptr i32 [[TMP90]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP71:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP81]], i32 0
+; POST-PROCESS-NEXT:    [[TMP72:%.*]] = load i32, ptr addrspace(21) [[TMP71]], align 4
 ; POST-PROCESS-NEXT:    [[TMP94:%.*]] = add i32 [[TMP3]], 68
-; POST-PROCESS-NEXT:    [[TMP73:%.*]] = inttoptr i32 [[TMP94]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP84:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP73]], i32 0
-; POST-PROCESS-NEXT:    [[TMP75:%.*]] = load i32, ptr addrspace(21) [[TMP84]], align 4
+; POST-PROCESS-NEXT:    [[TMP85:%.*]] = inttoptr i32 [[TMP94]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP75:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP85]], i32 0
+; POST-PROCESS-NEXT:    [[TMP76:%.*]] = load i32, ptr addrspace(21) [[TMP75]], align 4
 ; POST-PROCESS-NEXT:    [[TMP98:%.*]] = add i32 [[TMP3]], 72
-; POST-PROCESS-NEXT:    [[TMP77:%.*]] = inttoptr i32 [[TMP98]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP88:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP77]], i32 0
-; POST-PROCESS-NEXT:    [[TMP79:%.*]] = load i32, ptr addrspace(21) [[TMP88]], align 4
+; POST-PROCESS-NEXT:    [[TMP89:%.*]] = inttoptr i32 [[TMP98]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP79:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP89]], i32 0
+; POST-PROCESS-NEXT:    [[TMP80:%.*]] = load i32, ptr addrspace(21) [[TMP79]], align 4
 ; POST-PROCESS-NEXT:    [[TMP102:%.*]] = add i32 [[TMP3]], 76
-; POST-PROCESS-NEXT:    [[TMP81:%.*]] = inttoptr i32 [[TMP102]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP92:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP81]], i32 0
-; POST-PROCESS-NEXT:    [[TMP83:%.*]] = load i32, ptr addrspace(21) [[TMP92]], align 4
+; POST-PROCESS-NEXT:    [[TMP93:%.*]] = inttoptr i32 [[TMP102]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP83:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP93]], i32 0
+; POST-PROCESS-NEXT:    [[TMP84:%.*]] = load i32, ptr addrspace(21) [[TMP83]], align 4
 ; POST-PROCESS-NEXT:    [[TMP106:%.*]] = add i32 [[TMP3]], 80
-; POST-PROCESS-NEXT:    [[TMP85:%.*]] = inttoptr i32 [[TMP106]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP96:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP85]], i32 0
-; POST-PROCESS-NEXT:    [[TMP87:%.*]] = load i32, ptr addrspace(21) [[TMP96]], align 4
+; POST-PROCESS-NEXT:    [[TMP97:%.*]] = inttoptr i32 [[TMP106]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP87:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP97]], i32 0
+; POST-PROCESS-NEXT:    [[TMP88:%.*]] = load i32, ptr addrspace(21) [[TMP87]], align 4
 ; POST-PROCESS-NEXT:    [[TMP110:%.*]] = add i32 [[TMP3]], 84
-; POST-PROCESS-NEXT:    [[TMP89:%.*]] = inttoptr i32 [[TMP110]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP100:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP89]], i32 0
-; POST-PROCESS-NEXT:    [[TMP91:%.*]] = load i32, ptr addrspace(21) [[TMP100]], align 4
+; POST-PROCESS-NEXT:    [[TMP101:%.*]] = inttoptr i32 [[TMP110]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP91:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP101]], i32 0
+; POST-PROCESS-NEXT:    [[TMP92:%.*]] = load i32, ptr addrspace(21) [[TMP91]], align 4
 ; POST-PROCESS-NEXT:    [[TMP114:%.*]] = add i32 [[TMP3]], 88
-; POST-PROCESS-NEXT:    [[TMP93:%.*]] = inttoptr i32 [[TMP114]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP104:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP93]], i32 0
-; POST-PROCESS-NEXT:    [[TMP95:%.*]] = load i32, ptr addrspace(21) [[TMP104]], align 4
+; POST-PROCESS-NEXT:    [[TMP105:%.*]] = inttoptr i32 [[TMP114]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP95:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP105]], i32 0
+; POST-PROCESS-NEXT:    [[TMP96:%.*]] = load i32, ptr addrspace(21) [[TMP95]], align 4
 ; POST-PROCESS-NEXT:    [[TMP118:%.*]] = add i32 [[TMP3]], 92
-; POST-PROCESS-NEXT:    [[TMP97:%.*]] = inttoptr i32 [[TMP118]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP108:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP97]], i32 0
-; POST-PROCESS-NEXT:    [[TMP99:%.*]] = load i32, ptr addrspace(21) [[TMP108]], align 4
+; POST-PROCESS-NEXT:    [[TMP109:%.*]] = inttoptr i32 [[TMP118]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP99:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP109]], i32 0
+; POST-PROCESS-NEXT:    [[TMP100:%.*]] = load i32, ptr addrspace(21) [[TMP99]], align 4
 ; POST-PROCESS-NEXT:    [[TMP122:%.*]] = add i32 [[TMP3]], 96
-; POST-PROCESS-NEXT:    [[TMP101:%.*]] = inttoptr i32 [[TMP122]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP115:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP101]], i32 0
-; POST-PROCESS-NEXT:    [[TMP103:%.*]] = load i32, ptr addrspace(21) [[TMP115]], align 4
+; POST-PROCESS-NEXT:    [[TMP115:%.*]] = inttoptr i32 [[TMP122]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP103:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP115]], i32 0
+; POST-PROCESS-NEXT:    [[TMP104:%.*]] = load i32, ptr addrspace(21) [[TMP103]], align 4
 ; POST-PROCESS-NEXT:    [[TMP126:%.*]] = add i32 [[TMP3]], 100
-; POST-PROCESS-NEXT:    [[TMP105:%.*]] = inttoptr i32 [[TMP126]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP116:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP105]], i32 0
-; POST-PROCESS-NEXT:    [[TMP107:%.*]] = load i32, ptr addrspace(21) [[TMP116]], align 4
+; POST-PROCESS-NEXT:    [[TMP116:%.*]] = inttoptr i32 [[TMP126]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP107:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP116]], i32 0
+; POST-PROCESS-NEXT:    [[TMP108:%.*]] = load i32, ptr addrspace(21) [[TMP107]], align 4
 ; POST-PROCESS-NEXT:    [[TMP130:%.*]] = add i32 [[TMP3]], 104
-; POST-PROCESS-NEXT:    [[TMP109:%.*]] = inttoptr i32 [[TMP130]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP117:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP109]], i32 0
-; POST-PROCESS-NEXT:    [[TMP111:%.*]] = load i32, ptr addrspace(21) [[TMP117]], align 4
+; POST-PROCESS-NEXT:    [[TMP117:%.*]] = inttoptr i32 [[TMP130]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP111:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP117]], i32 0
+; POST-PROCESS-NEXT:    [[TMP120:%.*]] = load i32, ptr addrspace(21) [[TMP111]], align 4
 ; POST-PROCESS-NEXT:    [[TMP119:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP1]], 0
 ; POST-PROCESS-NEXT:    [[DOTFCA_0_EXTRACT57:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP119]], 0
 ; POST-PROCESS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
@@ -1388,8 +1538,6 @@ attributes #3 = { nounwind }
 ; POST-PROCESS-NEXT:    [[TMP113:%.*]] = add i32 [[TMP112]], -108
 ; POST-PROCESS-NEXT:    store i32 [[TMP113]], ptr [[CSP]], align 4
 ; POST-PROCESS-NEXT:    ret void
-; POST-PROCESS:       entryresume.0.split:
-; POST-PROCESS-NEXT:    unreachable
 ;
 ;
 ; POST-PROCESS-LABEL: define void @AnyHit(
@@ -2114,150 +2262,201 @@ attributes #3 = { nounwind }
 ; POST-PROCESS-NEXT:    [[TMP24:%.*]] = extractvalue [30 x i32] [[TMP4]], 27
 ; POST-PROCESS-NEXT:    [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP4]], 28
 ; POST-PROCESS-NEXT:    [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP4]], 29
-; POST-PROCESS-NEXT:    [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP5]], i32 0
-; POST-PROCESS-NEXT:    [[TMP29:%.*]] = load i32, ptr addrspace(21) [[TMP6]], align 4
+; POST-PROCESS-NEXT:    [[TMP5:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; POST-PROCESS-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 0
+; POST-PROCESS-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 1
+; POST-PROCESS-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 2
+; POST-PROCESS-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 3
+; POST-PROCESS-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 4
+; POST-PROCESS-NEXT:    [[DOTFCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 5
+; POST-PROCESS-NEXT:    [[DOTFCA_0_6_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 6
+; POST-PROCESS-NEXT:    [[DOTFCA_0_7_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 7
+; POST-PROCESS-NEXT:    [[DOTFCA_0_8_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 8
+; POST-PROCESS-NEXT:    [[DOTFCA_0_9_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 9
+; POST-PROCESS-NEXT:    [[DOTFCA_0_10_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 10
+; POST-PROCESS-NEXT:    [[DOTFCA_0_11_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 11
+; POST-PROCESS-NEXT:    [[DOTFCA_0_12_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 12
+; POST-PROCESS-NEXT:    [[DOTFCA_0_13_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 13
+; POST-PROCESS-NEXT:    [[DOTFCA_0_14_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 14
+; POST-PROCESS-NEXT:    [[DOTFCA_0_15_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 15
+; POST-PROCESS-NEXT:    [[DOTFCA_0_16_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 16
+; POST-PROCESS-NEXT:    [[DOTFCA_0_17_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 17
+; POST-PROCESS-NEXT:    [[DOTFCA_0_18_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 18
+; POST-PROCESS-NEXT:    [[DOTFCA_0_19_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 19
+; POST-PROCESS-NEXT:    [[DOTFCA_0_20_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 20
+; POST-PROCESS-NEXT:    [[DOTFCA_0_21_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 21
+; POST-PROCESS-NEXT:    [[DOTFCA_0_22_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 22
+; POST-PROCESS-NEXT:    [[DOTFCA_0_23_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 23
+; POST-PROCESS-NEXT:    [[DOTFCA_0_24_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 24
+; POST-PROCESS-NEXT:    [[DOTFCA_0_25_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 25
+; POST-PROCESS-NEXT:    [[DOTFCA_0_26_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 26
+; POST-PROCESS-NEXT:    [[DOTFCA_0_27_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 27
+; POST-PROCESS-NEXT:    [[DOTFCA_0_28_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 28
+; POST-PROCESS-NEXT:    [[DOTFCA_0_29_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 29
+; POST-PROCESS-NEXT:    [[DOTFCA_0_30_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 30
+; POST-PROCESS-NEXT:    [[DOTFCA_0_31_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 31
+; POST-PROCESS-NEXT:    [[DOTFCA_0_32_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 32
+; POST-PROCESS-NEXT:    [[DOTFCA_0_33_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 33
+; POST-PROCESS-NEXT:    [[DOTFCA_0_34_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 34
+; POST-PROCESS-NEXT:    [[DOTFCA_0_35_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 35
+; POST-PROCESS-NEXT:    [[DOTFCA_0_36_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 36
+; POST-PROCESS-NEXT:    [[DOTFCA_0_37_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 37
+; POST-PROCESS-NEXT:    [[DOTFCA_0_38_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 38
+; POST-PROCESS-NEXT:    [[DOTFCA_0_39_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 39
+; POST-PROCESS-NEXT:    [[DOTFCA_0_40_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 40
+; POST-PROCESS-NEXT:    [[DOTFCA_0_41_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 41
+; POST-PROCESS-NEXT:    [[DOTFCA_0_42_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 42
+; POST-PROCESS-NEXT:    [[DOTFCA_0_43_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 43
+; POST-PROCESS-NEXT:    [[DOTFCA_0_44_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 44
+; POST-PROCESS-NEXT:    [[DOTFCA_0_45_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 45
+; POST-PROCESS-NEXT:    [[DOTFCA_0_46_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 46
+; POST-PROCESS-NEXT:    [[DOTFCA_0_47_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 47
+; POST-PROCESS-NEXT:    [[DOTFCA_0_48_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 48
+; POST-PROCESS-NEXT:    [[DOTFCA_0_49_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP5]], 0, 49
+; POST-PROCESS-NEXT:    [[TMP6:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP6]], i32 0
+; POST-PROCESS-NEXT:    [[TMP29:%.*]] = load i32, ptr addrspace(21) [[TMP7]], align 4
 ; POST-PROCESS-NEXT:    [[TMP30:%.*]] = add i32 [[TMP3]], 4
-; POST-PROCESS-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP30]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP9]], i32 0
-; POST-PROCESS-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(21) [[TMP10]], align 4
+; POST-PROCESS-NEXT:    [[TMP10:%.*]] = inttoptr i32 [[TMP30]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP10]], i32 0
+; POST-PROCESS-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(21) [[TMP11]], align 4
 ; POST-PROCESS-NEXT:    [[TMP34:%.*]] = add i32 [[TMP3]], 8
-; POST-PROCESS-NEXT:    [[TMP13:%.*]] = inttoptr i32 [[TMP34]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP13]], i32 0
-; POST-PROCESS-NEXT:    [[TMP37:%.*]] = load i32, ptr addrspace(21) [[TMP14]], align 4
+; POST-PROCESS-NEXT:    [[TMP14:%.*]] = inttoptr i32 [[TMP34]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP14]], i32 0
+; POST-PROCESS-NEXT:    [[TMP37:%.*]] = load i32, ptr addrspace(21) [[TMP15]], align 4
 ; POST-PROCESS-NEXT:    [[TMP38:%.*]] = add i32 [[TMP3]], 12
-; POST-PROCESS-NEXT:    [[TMP17:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP17]], i32 0
-; POST-PROCESS-NEXT:    [[TMP41:%.*]] = load i32, ptr addrspace(21) [[TMP18]], align 4
+; POST-PROCESS-NEXT:    [[TMP18:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP18]], i32 0
+; POST-PROCESS-NEXT:    [[TMP41:%.*]] = load i32, ptr addrspace(21) [[TMP19]], align 4
 ; POST-PROCESS-NEXT:    [[TMP42:%.*]] = add i32 [[TMP3]], 16
-; POST-PROCESS-NEXT:    [[TMP21:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP21]], i32 0
+; POST-PROCESS-NEXT:    [[TMP25:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP25]], i32 0
 ; POST-PROCESS-NEXT:    [[TMP45:%.*]] = load i32, ptr addrspace(21) [[TMP28]], align 4
 ; POST-PROCESS-NEXT:    [[TMP46:%.*]] = add i32 [[TMP3]], 20
-; POST-PROCESS-NEXT:    [[TMP25:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP25]], i32 0
-; POST-PROCESS-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(21) [[TMP26]], align 4
+; POST-PROCESS-NEXT:    [[TMP26:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP26]], i32 0
+; POST-PROCESS-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(21) [[TMP32]], align 4
 ; POST-PROCESS-NEXT:    [[TMP50:%.*]] = add i32 [[TMP3]], 24
-; POST-PROCESS-NEXT:    [[TMP31:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP31]], i32 0
-; POST-PROCESS-NEXT:    [[TMP53:%.*]] = load i32, ptr addrspace(21) [[TMP32]], align 4
+; POST-PROCESS-NEXT:    [[TMP36:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP31:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP36]], i32 0
+; POST-PROCESS-NEXT:    [[TMP53:%.*]] = load i32, ptr addrspace(21) [[TMP31]], align 4
 ; POST-PROCESS-NEXT:    [[TMP54:%.*]] = add i32 [[TMP3]], 28
-; POST-PROCESS-NEXT:    [[TMP35:%.*]] = inttoptr i32 [[TMP54]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP35]], i32 0
-; POST-PROCESS-NEXT:    [[TMP57:%.*]] = load i32, ptr addrspace(21) [[TMP36]], align 4
+; POST-PROCESS-NEXT:    [[TMP40:%.*]] = inttoptr i32 [[TMP54]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP40]], i32 0
+; POST-PROCESS-NEXT:    [[TMP57:%.*]] = load i32, ptr addrspace(21) [[TMP35]], align 4
 ; POST-PROCESS-NEXT:    [[TMP58:%.*]] = add i32 [[TMP3]], 32
-; POST-PROCESS-NEXT:    [[TMP39:%.*]] = inttoptr i32 [[TMP58]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP39]], i32 0
-; POST-PROCESS-NEXT:    [[TMP61:%.*]] = load i32, ptr addrspace(21) [[TMP40]], align 4
+; POST-PROCESS-NEXT:    [[TMP44:%.*]] = inttoptr i32 [[TMP58]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP44]], i32 0
+; POST-PROCESS-NEXT:    [[TMP61:%.*]] = load i32, ptr addrspace(21) [[TMP39]], align 4
 ; POST-PROCESS-NEXT:    [[TMP62:%.*]] = add i32 [[TMP3]], 36
-; POST-PROCESS-NEXT:    [[TMP43:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP43]], i32 0
-; POST-PROCESS-NEXT:    [[TMP65:%.*]] = load i32, ptr addrspace(21) [[TMP44]], align 4
+; POST-PROCESS-NEXT:    [[TMP48:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP48]], i32 0
+; POST-PROCESS-NEXT:    [[TMP65:%.*]] = load i32, ptr addrspace(21) [[TMP43]], align 4
 ; POST-PROCESS-NEXT:    [[TMP66:%.*]] = add i32 [[TMP3]], 40
-; POST-PROCESS-NEXT:    [[TMP47:%.*]] = inttoptr i32 [[TMP66]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP47]], i32 0
-; POST-PROCESS-NEXT:    [[TMP69:%.*]] = load i32, ptr addrspace(21) [[TMP48]], align 4
+; POST-PROCESS-NEXT:    [[TMP52:%.*]] = inttoptr i32 [[TMP66]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP47:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP52]], i32 0
+; POST-PROCESS-NEXT:    [[TMP69:%.*]] = load i32, ptr addrspace(21) [[TMP47]], align 4
 ; POST-PROCESS-NEXT:    [[TMP70:%.*]] = add i32 [[TMP3]], 44
-; POST-PROCESS-NEXT:    [[TMP51:%.*]] = inttoptr i32 [[TMP70]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP52:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP51]], i32 0
-; POST-PROCESS-NEXT:    [[TMP73:%.*]] = load i32, ptr addrspace(21) [[TMP52]], align 4
+; POST-PROCESS-NEXT:    [[TMP56:%.*]] = inttoptr i32 [[TMP70]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP56]], i32 0
+; POST-PROCESS-NEXT:    [[TMP73:%.*]] = load i32, ptr addrspace(21) [[TMP51]], align 4
 ; POST-PROCESS-NEXT:    [[TMP74:%.*]] = add i32 [[TMP3]], 48
-; POST-PROCESS-NEXT:    [[TMP55:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP56:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP55]], i32 0
-; POST-PROCESS-NEXT:    [[TMP77:%.*]] = load i32, ptr addrspace(21) [[TMP56]], align 4
+; POST-PROCESS-NEXT:    [[TMP60:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP55:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP60]], i32 0
+; POST-PROCESS-NEXT:    [[TMP77:%.*]] = load i32, ptr addrspace(21) [[TMP55]], align 4
 ; POST-PROCESS-NEXT:    [[TMP78:%.*]] = add i32 [[TMP3]], 52
-; POST-PROCESS-NEXT:    [[TMP59:%.*]] = inttoptr i32 [[TMP78]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP59]], i32 0
-; POST-PROCESS-NEXT:    [[TMP81:%.*]] = load i32, ptr addrspace(21) [[TMP60]], align 4
+; POST-PROCESS-NEXT:    [[TMP64:%.*]] = inttoptr i32 [[TMP78]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP59:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP64]], i32 0
+; POST-PROCESS-NEXT:    [[TMP81:%.*]] = load i32, ptr addrspace(21) [[TMP59]], align 4
 ; POST-PROCESS-NEXT:    [[TMP82:%.*]] = add i32 [[TMP3]], 56
-; POST-PROCESS-NEXT:    [[TMP63:%.*]] = inttoptr i32 [[TMP82]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP63]], i32 0
-; POST-PROCESS-NEXT:    [[TMP85:%.*]] = load i32, ptr addrspace(21) [[TMP64]], align 4
+; POST-PROCESS-NEXT:    [[TMP68:%.*]] = inttoptr i32 [[TMP82]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP68]], i32 0
+; POST-PROCESS-NEXT:    [[TMP85:%.*]] = load i32, ptr addrspace(21) [[TMP63]], align 4
 ; POST-PROCESS-NEXT:    [[TMP86:%.*]] = add i32 [[TMP3]], 60
-; POST-PROCESS-NEXT:    [[TMP67:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP67]], i32 0
-; POST-PROCESS-NEXT:    [[TMP89:%.*]] = load i32, ptr addrspace(21) [[TMP68]], align 4
+; POST-PROCESS-NEXT:    [[TMP72:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP67:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP72]], i32 0
+; POST-PROCESS-NEXT:    [[TMP89:%.*]] = load i32, ptr addrspace(21) [[TMP67]], align 4
 ; POST-PROCESS-NEXT:    [[TMP90:%.*]] = add i32 [[TMP3]], 64
-; POST-PROCESS-NEXT:    [[TMP71:%.*]] = inttoptr i32 [[TMP90]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP72:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP71]], i32 0
-; POST-PROCESS-NEXT:    [[TMP93:%.*]] = load i32, ptr addrspace(21) [[TMP72]], align 4
+; POST-PROCESS-NEXT:    [[TMP76:%.*]] = inttoptr i32 [[TMP90]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP71:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP76]], i32 0
+; POST-PROCESS-NEXT:    [[TMP93:%.*]] = load i32, ptr addrspace(21) [[TMP71]], align 4
 ; POST-PROCESS-NEXT:    [[TMP94:%.*]] = add i32 [[TMP3]], 68
-; POST-PROCESS-NEXT:    [[TMP75:%.*]] = inttoptr i32 [[TMP94]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP76:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP75]], i32 0
-; POST-PROCESS-NEXT:    [[TMP97:%.*]] = load i32, ptr addrspace(21) [[TMP76]], align 4
+; POST-PROCESS-NEXT:    [[TMP80:%.*]] = inttoptr i32 [[TMP94]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP75:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP80]], i32 0
+; POST-PROCESS-NEXT:    [[TMP97:%.*]] = load i32, ptr addrspace(21) [[TMP75]], align 4
 ; POST-PROCESS-NEXT:    [[TMP98:%.*]] = add i32 [[TMP3]], 72
-; POST-PROCESS-NEXT:    [[TMP79:%.*]] = inttoptr i32 [[TMP98]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP80:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP79]], i32 0
-; POST-PROCESS-NEXT:    [[TMP101:%.*]] = load i32, ptr addrspace(21) [[TMP80]], align 4
+; POST-PROCESS-NEXT:    [[TMP84:%.*]] = inttoptr i32 [[TMP98]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP79:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP84]], i32 0
+; POST-PROCESS-NEXT:    [[TMP101:%.*]] = load i32, ptr addrspace(21) [[TMP79]], align 4
 ; POST-PROCESS-NEXT:    [[TMP102:%.*]] = add i32 [[TMP3]], 76
-; POST-PROCESS-NEXT:    [[TMP83:%.*]] = inttoptr i32 [[TMP102]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP84:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP83]], i32 0
-; POST-PROCESS-NEXT:    [[TMP105:%.*]] = load i32, ptr addrspace(21) [[TMP84]], align 4
+; POST-PROCESS-NEXT:    [[TMP88:%.*]] = inttoptr i32 [[TMP102]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP83:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP88]], i32 0
+; POST-PROCESS-NEXT:    [[TMP105:%.*]] = load i32, ptr addrspace(21) [[TMP83]], align 4
 ; POST-PROCESS-NEXT:    [[TMP106:%.*]] = add i32 [[TMP3]], 80
-; POST-PROCESS-NEXT:    [[TMP87:%.*]] = inttoptr i32 [[TMP106]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP88:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP87]], i32 0
-; POST-PROCESS-NEXT:    [[TMP109:%.*]] = load i32, ptr addrspace(21) [[TMP88]], align 4
+; POST-PROCESS-NEXT:    [[TMP92:%.*]] = inttoptr i32 [[TMP106]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP87:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP92]], i32 0
+; POST-PROCESS-NEXT:    [[TMP109:%.*]] = load i32, ptr addrspace(21) [[TMP87]], align 4
 ; POST-PROCESS-NEXT:    [[TMP110:%.*]] = add i32 [[TMP3]], 84
-; POST-PROCESS-NEXT:    [[TMP91:%.*]] = inttoptr i32 [[TMP110]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP92:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP91]], i32 0
-; POST-PROCESS-NEXT:    [[TMP113:%.*]] = load i32, ptr addrspace(21) [[TMP92]], align 4
+; POST-PROCESS-NEXT:    [[TMP96:%.*]] = inttoptr i32 [[TMP110]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP91:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP96]], i32 0
+; POST-PROCESS-NEXT:    [[TMP113:%.*]] = load i32, ptr addrspace(21) [[TMP91]], align 4
 ; POST-PROCESS-NEXT:    [[TMP114:%.*]] = add i32 [[TMP3]], 88
-; POST-PROCESS-NEXT:    [[TMP95:%.*]] = inttoptr i32 [[TMP114]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP96:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP95]], i32 0
-; POST-PROCESS-NEXT:    [[TMP117:%.*]] = load i32, ptr addrspace(21) [[TMP96]], align 4
+; POST-PROCESS-NEXT:    [[TMP100:%.*]] = inttoptr i32 [[TMP114]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP95:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP100]], i32 0
+; POST-PROCESS-NEXT:    [[TMP117:%.*]] = load i32, ptr addrspace(21) [[TMP95]], align 4
 ; POST-PROCESS-NEXT:    [[TMP118:%.*]] = add i32 [[TMP3]], 92
-; POST-PROCESS-NEXT:    [[TMP99:%.*]] = inttoptr i32 [[TMP118]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP100:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP99]], i32 0
-; POST-PROCESS-NEXT:    [[TMP121:%.*]] = load i32, ptr addrspace(21) [[TMP100]], align 4
+; POST-PROCESS-NEXT:    [[TMP104:%.*]] = inttoptr i32 [[TMP118]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP99:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP104]], i32 0
+; POST-PROCESS-NEXT:    [[TMP121:%.*]] = load i32, ptr addrspace(21) [[TMP99]], align 4
 ; POST-PROCESS-NEXT:    [[TMP122:%.*]] = add i32 [[TMP3]], 96
-; POST-PROCESS-NEXT:    [[TMP103:%.*]] = inttoptr i32 [[TMP122]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP104:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP103]], i32 0
-; POST-PROCESS-NEXT:    [[TMP125:%.*]] = load i32, ptr addrspace(21) [[TMP104]], align 4
+; POST-PROCESS-NEXT:    [[TMP108:%.*]] = inttoptr i32 [[TMP122]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP103:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP108]], i32 0
+; POST-PROCESS-NEXT:    [[TMP125:%.*]] = load i32, ptr addrspace(21) [[TMP103]], align 4
 ; POST-PROCESS-NEXT:    [[TMP126:%.*]] = add i32 [[TMP3]], 100
-; POST-PROCESS-NEXT:    [[TMP107:%.*]] = inttoptr i32 [[TMP126]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP108:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP107]], i32 0
-; POST-PROCESS-NEXT:    [[TMP129:%.*]] = load i32, ptr addrspace(21) [[TMP108]], align 4
+; POST-PROCESS-NEXT:    [[TMP119:%.*]] = inttoptr i32 [[TMP126]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP107:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP119]], i32 0
+; POST-PROCESS-NEXT:    [[TMP129:%.*]] = load i32, ptr addrspace(21) [[TMP107]], align 4
 ; POST-PROCESS-NEXT:    [[TMP130:%.*]] = add i32 [[TMP3]], 104
-; POST-PROCESS-NEXT:    [[TMP111:%.*]] = inttoptr i32 [[TMP130]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP116:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP111]], i32 0
-; POST-PROCESS-NEXT:    [[TMP133:%.*]] = load i32, ptr addrspace(21) [[TMP116]], align 4
+; POST-PROCESS-NEXT:    [[TMP124:%.*]] = inttoptr i32 [[TMP130]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP111:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP124]], i32 0
+; POST-PROCESS-NEXT:    [[TMP133:%.*]] = load i32, ptr addrspace(21) [[TMP111]], align 4
 ; POST-PROCESS-NEXT:    [[TMP112:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP1]], 0
 ; POST-PROCESS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP112]], 0
 ; POST-PROCESS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
 ; POST-PROCESS-NEXT:    [[TMP135:%.*]] = add i32 [[TMP2]], 116
-; POST-PROCESS-NEXT:    [[TMP120:%.*]] = inttoptr i32 [[TMP135]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP115:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP120]], i32 0
-; POST-PROCESS-NEXT:    [[TMP141:%.*]] = load i32, ptr addrspace(21) [[TMP115]], align 4
+; POST-PROCESS-NEXT:    [[TMP115:%.*]] = inttoptr i32 [[TMP135]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP116:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP115]], i32 0
+; POST-PROCESS-NEXT:    [[TMP141:%.*]] = load i32, ptr addrspace(21) [[TMP116]], align 4
 ; POST-PROCESS-NEXT:    [[TMP138:%.*]] = add i32 [[TMP2]], 108
-; POST-PROCESS-NEXT:    [[TMP123:%.*]] = inttoptr i32 [[TMP138]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP140:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP123]], i32 0
+; POST-PROCESS-NEXT:    [[TMP127:%.*]] = inttoptr i32 [[TMP138]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP140:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP127]], i32 0
 ; POST-PROCESS-NEXT:    [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(21) [[TMP140]], align 4
-; POST-PROCESS-NEXT:    [[TMP119:%.*]] = inttoptr i32 [[TMP141]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP143:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP119]], i32 0
+; POST-PROCESS-NEXT:    [[TMP120:%.*]] = inttoptr i32 [[TMP141]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP143:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP120]], i32 0
 ; POST-PROCESS-NEXT:    store i32 [[TMP29]], ptr addrspace(21) [[TMP143]], align 4
 ; POST-PROCESS-NEXT:    [[TMP144:%.*]] = add i32 [[TMP141]], 4
-; POST-PROCESS-NEXT:    [[TMP124:%.*]] = inttoptr i32 [[TMP144]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP146:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP124]], i32 0
+; POST-PROCESS-NEXT:    [[TMP123:%.*]] = inttoptr i32 [[TMP144]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP146:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP123]], i32 0
 ; POST-PROCESS-NEXT:    store i32 [[TMP33]], ptr addrspace(21) [[TMP146]], align 4
 ; POST-PROCESS-NEXT:    [[TMP147:%.*]] = add i32 [[TMP141]], 8
-; POST-PROCESS-NEXT:    [[TMP127:%.*]] = inttoptr i32 [[TMP147]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP149:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP127]], i32 0
+; POST-PROCESS-NEXT:    [[TMP128:%.*]] = inttoptr i32 [[TMP147]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP149:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP128]], i32 0
 ; POST-PROCESS-NEXT:    store i32 [[TMP37]], ptr addrspace(21) [[TMP149]], align 4
 ; POST-PROCESS-NEXT:    [[TMP150:%.*]] = add i32 [[TMP141]], 12
-; POST-PROCESS-NEXT:    [[TMP128:%.*]] = inttoptr i32 [[TMP150]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP152:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP128]], i32 0
+; POST-PROCESS-NEXT:    [[TMP131:%.*]] = inttoptr i32 [[TMP150]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP152:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP131]], i32 0
 ; POST-PROCESS-NEXT:    store i32 [[TMP41]], ptr addrspace(21) [[TMP152]], align 4
 ; POST-PROCESS-NEXT:    [[TMP153:%.*]] = add i32 [[TMP141]], 16
-; POST-PROCESS-NEXT:    [[TMP131:%.*]] = inttoptr i32 [[TMP153]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP155:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP131]], i32 0
+; POST-PROCESS-NEXT:    [[TMP132:%.*]] = inttoptr i32 [[TMP153]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP155:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP132]], i32 0
 ; POST-PROCESS-NEXT:    store i32 [[TMP45]], ptr addrspace(21) [[TMP155]], align 4
 ; POST-PROCESS-NEXT:    [[TMP156:%.*]] = add i32 [[TMP141]], 20
-; POST-PROCESS-NEXT:    [[TMP134:%.*]] = inttoptr i32 [[TMP156]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP158:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP134]], i32 0
+; POST-PROCESS-NEXT:    [[TMP136:%.*]] = inttoptr i32 [[TMP156]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP158:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP136]], i32 0
 ; POST-PROCESS-NEXT:    store i32 [[TMP49]], ptr addrspace(21) [[TMP158]], align 4
 ; POST-PROCESS-NEXT:    [[TMP159:%.*]] = add i32 [[TMP141]], 24
-; POST-PROCESS-NEXT:    [[TMP137:%.*]] = inttoptr i32 [[TMP159]] to ptr addrspace(21)
-; POST-PROCESS-NEXT:    [[TMP161:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP137]], i32 0
+; POST-PROCESS-NEXT:    [[TMP139:%.*]] = inttoptr i32 [[TMP159]] to ptr addrspace(21)
+; POST-PROCESS-NEXT:    [[TMP161:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP139]], i32 0
 ; POST-PROCESS-NEXT:    store i32 [[TMP53]], ptr addrspace(21) [[TMP161]], align 4
 ; POST-PROCESS-NEXT:    [[TMP162:%.*]] = add i32 [[TMP141]], 28
 ; POST-PROCESS-NEXT:    [[TMP142:%.*]] = inttoptr i32 [[TMP162]] to ptr addrspace(21)
@@ -2567,86 +2766,137 @@ attributes #3 = { nounwind }
 ; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 27
 ; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 28
 ; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 29
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP5]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(22) [[TMP7]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP7:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 0
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 1
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 2
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 3
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 4
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 5
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_6_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 6
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_7_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 7
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_8_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 8
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_9_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 9
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_10_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 10
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_11_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 11
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_12_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 12
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_13_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 13
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_14_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 14
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_15_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 15
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_16_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 16
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_17_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 17
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_18_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 18
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_19_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 19
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_20_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 20
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_21_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 21
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_22_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 22
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_23_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 23
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_24_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 24
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_25_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 25
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_26_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 26
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_27_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 27
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_28_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 28
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_29_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 29
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_30_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 30
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_31_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 31
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_32_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 32
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_33_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 33
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_34_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 34
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_35_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 35
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_36_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 36
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_37_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 37
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_38_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 38
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_39_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 39
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_40_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 40
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_41_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 41
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_42_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 42
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_43_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 43
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_44_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 44
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_45_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 45
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_46_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 46
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_47_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 47
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_48_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 48
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_49_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 49
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP5]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(22) [[TMP8]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP31:%.*]] = add i32 [[TMP5]], 4
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP31]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(22) [[TMP10]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP31]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(22) [[TMP11]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP34:%.*]] = add i32 [[TMP5]], 8
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP34]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(22) [[TMP13]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP34]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(22) [[TMP14]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP37:%.*]] = add i32 [[TMP5]], 12
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP37]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(22) [[TMP16]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP37]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP18:%.*]] = load i32, ptr addrspace(22) [[TMP17]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP40:%.*]] = add i32 [[TMP5]], 16
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP40]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP20:%.*]] = load i32, ptr addrspace(22) [[TMP19]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP40]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP21:%.*]] = load i32, ptr addrspace(22) [[TMP20]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP43:%.*]] = add i32 [[TMP5]], 20
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP43]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP27:%.*]] = load i32, ptr addrspace(22) [[TMP24]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP43]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP24:%.*]] = load i32, ptr addrspace(22) [[TMP25]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP46:%.*]] = add i32 [[TMP5]], 24
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP46]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(22) [[TMP25]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP46]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP27:%.*]] = load i32, ptr addrspace(22) [[TMP26]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP49:%.*]] = add i32 [[TMP5]], 28
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP49]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP30:%.*]] = load i32, ptr addrspace(22) [[TMP28]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP90:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP49]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP30:%.*]] = load i32, ptr addrspace(22) [[TMP90]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP52:%.*]] = add i32 [[TMP5]], 32
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP52]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP32:%.*]] = load i32, ptr addrspace(22) [[TMP33]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP52]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(22) [[TMP32]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP55:%.*]] = add i32 [[TMP5]], 36
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP55]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP35:%.*]] = load i32, ptr addrspace(22) [[TMP36]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP55]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP36:%.*]] = load i32, ptr addrspace(22) [[TMP35]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP58:%.*]] = add i32 [[TMP5]], 40
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP58]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP38:%.*]] = load i32, ptr addrspace(22) [[TMP39]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP58]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP39:%.*]] = load i32, ptr addrspace(22) [[TMP38]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP61:%.*]] = add i32 [[TMP5]], 44
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP61]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP41:%.*]] = load i32, ptr addrspace(22) [[TMP42]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP61]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP42:%.*]] = load i32, ptr addrspace(22) [[TMP41]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP64:%.*]] = add i32 [[TMP5]], 48
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP45:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP64]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP44:%.*]] = load i32, ptr addrspace(22) [[TMP45]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP64]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP45:%.*]] = load i32, ptr addrspace(22) [[TMP44]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP67:%.*]] = add i32 [[TMP5]], 52
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP67]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP47:%.*]] = load i32, ptr addrspace(22) [[TMP48]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP47:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP67]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP48:%.*]] = load i32, ptr addrspace(22) [[TMP47]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP70:%.*]] = add i32 [[TMP5]], 56
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP70]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP50:%.*]] = load i32, ptr addrspace(22) [[TMP51]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP70]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP51:%.*]] = load i32, ptr addrspace(22) [[TMP50]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP73:%.*]] = add i32 [[TMP5]], 60
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP73]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP53:%.*]] = load i32, ptr addrspace(22) [[TMP54]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP53:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP73]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP54:%.*]] = load i32, ptr addrspace(22) [[TMP53]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP76:%.*]] = add i32 [[TMP5]], 64
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP57:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP76]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP56:%.*]] = load i32, ptr addrspace(22) [[TMP57]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP56:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP76]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP57:%.*]] = load i32, ptr addrspace(22) [[TMP56]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP79:%.*]] = add i32 [[TMP5]], 68
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP79]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP59:%.*]] = load i32, ptr addrspace(22) [[TMP60]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP59:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP79]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP60:%.*]] = load i32, ptr addrspace(22) [[TMP59]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP82:%.*]] = add i32 [[TMP5]], 72
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP82]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP62:%.*]] = load i32, ptr addrspace(22) [[TMP63]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP62:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP82]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP63:%.*]] = load i32, ptr addrspace(22) [[TMP62]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP85:%.*]] = add i32 [[TMP5]], 76
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP66:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP85]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP65:%.*]] = load i32, ptr addrspace(22) [[TMP66]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP65:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP85]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP66:%.*]] = load i32, ptr addrspace(22) [[TMP65]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP88:%.*]] = add i32 [[TMP5]], 80
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP69:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP88]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP68:%.*]] = load i32, ptr addrspace(22) [[TMP69]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP88]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP69:%.*]] = load i32, ptr addrspace(22) [[TMP68]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP91:%.*]] = add i32 [[TMP5]], 84
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP72:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP91]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP71:%.*]] = load i32, ptr addrspace(22) [[TMP72]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP71:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP91]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP72:%.*]] = load i32, ptr addrspace(22) [[TMP71]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP94:%.*]] = add i32 [[TMP5]], 88
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP75:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP94]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP74:%.*]] = load i32, ptr addrspace(22) [[TMP75]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP74:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP94]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP75:%.*]] = load i32, ptr addrspace(22) [[TMP74]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP97:%.*]] = add i32 [[TMP5]], 92
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP78:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP97]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP77:%.*]] = load i32, ptr addrspace(22) [[TMP78]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP77:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP97]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP78:%.*]] = load i32, ptr addrspace(22) [[TMP77]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP100:%.*]] = add i32 [[TMP5]], 96
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP81:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP100]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP80:%.*]] = load i32, ptr addrspace(22) [[TMP81]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP80:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP100]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP81:%.*]] = load i32, ptr addrspace(22) [[TMP80]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP103:%.*]] = add i32 [[TMP5]], 100
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP84:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP103]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP83:%.*]] = load i32, ptr addrspace(22) [[TMP84]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP83:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP103]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP84:%.*]] = load i32, ptr addrspace(22) [[TMP83]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP106:%.*]] = add i32 [[TMP5]], 104
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP90:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP106]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP86:%.*]] = load i32, ptr addrspace(22) [[TMP90]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP86:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP106]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP93:%.*]] = load i32, ptr addrspace(22) [[TMP86]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP92:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP1]], 0
 ; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_EXTRACT57:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP92]], 0
 ; POST-PROCESS-GLOBAL-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
@@ -2654,8 +2904,6 @@ attributes #3 = { nounwind }
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP89:%.*]] = add i32 [[TMP87]], -108
 ; POST-PROCESS-GLOBAL-NEXT:    store i32 [[TMP89]], ptr [[CSP]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    ret void
-; POST-PROCESS-GLOBAL:       entryresume.0.split:
-; POST-PROCESS-GLOBAL-NEXT:    unreachable
 ;
 ;
 ; POST-PROCESS-GLOBAL-LABEL: define void @AnyHit(
@@ -3276,29 +3524,80 @@ attributes #3 = { nounwind }
 ; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 27
 ; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 28
 ; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 29
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP5]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP30:%.*]] = load i32, ptr addrspace(22) [[TMP7]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP7:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 0
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 1
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 2
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 3
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 4
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 5
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_6_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 6
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_7_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 7
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_8_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 8
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_9_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 9
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_10_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 10
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_11_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 11
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_12_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 12
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_13_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 13
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_14_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 14
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_15_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 15
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_16_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 16
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_17_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 17
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_18_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 18
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_19_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 19
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_20_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 20
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_21_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 21
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_22_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 22
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_23_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 23
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_24_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 24
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_25_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 25
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_26_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 26
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_27_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 27
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_28_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 28
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_29_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 29
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_30_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 30
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_31_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 31
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_32_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 32
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_33_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 33
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_34_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 34
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_35_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 35
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_36_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 36
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_37_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 37
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_38_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 38
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_39_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 39
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_40_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 40
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_41_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 41
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_42_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 42
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_43_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 43
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_44_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 44
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_45_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 45
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_46_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 46
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_47_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 47
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_48_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 48
+; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_49_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 49
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP5]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP30:%.*]] = load i32, ptr addrspace(22) [[TMP8]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP31:%.*]] = add i32 [[TMP5]], 4
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP31]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(22) [[TMP10]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP31]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(22) [[TMP11]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP34:%.*]] = add i32 [[TMP5]], 8
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP34]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP36:%.*]] = load i32, ptr addrspace(22) [[TMP13]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP34]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP36:%.*]] = load i32, ptr addrspace(22) [[TMP14]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP37:%.*]] = add i32 [[TMP5]], 12
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP37]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP39:%.*]] = load i32, ptr addrspace(22) [[TMP16]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP37]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP39:%.*]] = load i32, ptr addrspace(22) [[TMP17]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP40:%.*]] = add i32 [[TMP5]], 16
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP40]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP42:%.*]] = load i32, ptr addrspace(22) [[TMP19]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP40]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP42:%.*]] = load i32, ptr addrspace(22) [[TMP20]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP43:%.*]] = add i32 [[TMP5]], 20
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP43]]
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP45:%.*]] = load i32, ptr addrspace(22) [[TMP24]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP46:%.*]] = add i32 [[TMP5]], 24
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP46]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP48:%.*]] = load i32, ptr addrspace(22) [[TMP25]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP46]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP48:%.*]] = load i32, ptr addrspace(22) [[TMP26]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP49:%.*]] = add i32 [[TMP5]], 28
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP49]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP51:%.*]] = load i32, ptr addrspace(22) [[TMP28]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP89:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP49]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP51:%.*]] = load i32, ptr addrspace(22) [[TMP89]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP52:%.*]] = add i32 [[TMP5]], 32
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP52]]
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP54:%.*]] = load i32, ptr addrspace(22) [[TMP32]], align 4
@@ -3360,8 +3659,8 @@ attributes #3 = { nounwind }
 ; POST-PROCESS-GLOBAL-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP92]], 0
 ; POST-PROCESS-GLOBAL-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP110:%.*]] = add i32 [[TMP4]], 116
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP89:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP110]]
-; POST-PROCESS-GLOBAL-NEXT:    [[TMP114:%.*]] = load i32, ptr addrspace(22) [[TMP89]], align 4
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP95:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP110]]
+; POST-PROCESS-GLOBAL-NEXT:    [[TMP114:%.*]] = load i32, ptr addrspace(22) [[TMP95]], align 4
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP112:%.*]] = add i32 [[TMP4]], 108
 ; POST-PROCESS-GLOBAL-NEXT:    [[TMP113:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP112]]
 ; POST-PROCESS-GLOBAL-NEXT:    [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(22) [[TMP113]], align 4
@@ -3497,7 +3796,7 @@ attributes #3 = { nounwind }
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @main(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !lgc.cps [[META19:![0-9]+]] !continuation [[META20:![0-9]+]] !continuation.stacksize [[META21:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !lgc.cps [[META19:![0-9]+]] !continuation [[META20:![0-9]+]] !continuation.stacksize [[META21:![0-9]+]] !continuation.state [[META8]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 108)
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MAIN_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
@@ -3598,12 +3897,12 @@ attributes #3 = { nounwind }
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_27_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT]], i32 undef, 27
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 undef, 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 undef, 29
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !waitmask [[META22:![0-9]+]], !continuation.returnedRegistercount [[META17:![0-9]+]], !continuation.registercount [[META17]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i32 poison, i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !waitmask [[META22:![0-9]+]], !continuation.returnedRegistercount [[META17:![0-9]+]], !continuation.registercount [[META17]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define dso_local void @main.resume.0(
-; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [23 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META8]] !lgc.cps [[META19]] !continuation [[META20]] {
+; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [23 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META8]] !lgc.cps [[META19]] !continuation [[META20]] !continuation.registercount [[META17]] {
 ; CLEANUP-CPS-NEXT:  entryresume.0:
 ; CLEANUP-CPS-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 108)
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MAIN_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0
@@ -3638,60 +3937,111 @@ attributes #3 = { nounwind }
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 27
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 29
+; CLEANUP-CPS-NEXT:    [[TMP7:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 0
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 1
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 2
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 3
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 4
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 5
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_6_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 6
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_7_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 7
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_8_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 8
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_9_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 9
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_10_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 10
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_11_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 11
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_12_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 12
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_13_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 13
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_14_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 14
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_15_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 15
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_16_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 16
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_17_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 17
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_18_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 18
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_19_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 19
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_20_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 20
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_21_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 21
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_22_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 22
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_23_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 23
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_24_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 24
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_25_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 25
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_26_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 26
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_27_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 27
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_28_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 28
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_29_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 29
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_30_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 30
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_31_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 31
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_32_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 32
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_33_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 33
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_34_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 34
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_35_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 35
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_36_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 36
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_37_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 37
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_38_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 38
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_39_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 39
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_40_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 40
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_41_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 41
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_42_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 42
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_43_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 43
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_44_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 44
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_45_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 45
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_46_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 46
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_47_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 47
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_48_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 48
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_49_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 49
 ; CLEANUP-CPS-NEXT:    [[TMP6:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(32)
-; CLEANUP-CPS-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(32) [[TMP6]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(32) [[TMP6]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 1
-; CLEANUP-CPS-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(32) [[TMP8]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(32) [[TMP8]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 2
-; CLEANUP-CPS-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(32) [[TMP10]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(32) [[TMP10]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 3
-; CLEANUP-CPS-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(32) [[TMP12]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(32) [[TMP12]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 4
-; CLEANUP-CPS-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(32) [[TMP14]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(32) [[TMP14]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 5
-; CLEANUP-CPS-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(32) [[TMP16]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP19:%.*]] = load i32, ptr addrspace(32) [[TMP16]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 6
-; CLEANUP-CPS-NEXT:    [[TMP19:%.*]] = load i32, ptr addrspace(32) [[TMP18]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP21:%.*]] = load i32, ptr addrspace(32) [[TMP18]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 7
-; CLEANUP-CPS-NEXT:    [[TMP21:%.*]] = load i32, ptr addrspace(32) [[TMP20]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP23:%.*]] = load i32, ptr addrspace(32) [[TMP20]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 8
-; CLEANUP-CPS-NEXT:    [[TMP23:%.*]] = load i32, ptr addrspace(32) [[TMP22]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP25:%.*]] = load i32, ptr addrspace(32) [[TMP22]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 9
-; CLEANUP-CPS-NEXT:    [[TMP25:%.*]] = load i32, ptr addrspace(32) [[TMP24]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP27:%.*]] = load i32, ptr addrspace(32) [[TMP24]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 10
-; CLEANUP-CPS-NEXT:    [[TMP27:%.*]] = load i32, ptr addrspace(32) [[TMP26]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP29:%.*]] = load i32, ptr addrspace(32) [[TMP26]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 11
-; CLEANUP-CPS-NEXT:    [[TMP29:%.*]] = load i32, ptr addrspace(32) [[TMP28]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP31:%.*]] = load i32, ptr addrspace(32) [[TMP28]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 12
-; CLEANUP-CPS-NEXT:    [[TMP31:%.*]] = load i32, ptr addrspace(32) [[TMP30]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(32) [[TMP30]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 13
-; CLEANUP-CPS-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(32) [[TMP32]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP35:%.*]] = load i32, ptr addrspace(32) [[TMP32]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 14
-; CLEANUP-CPS-NEXT:    [[TMP35:%.*]] = load i32, ptr addrspace(32) [[TMP34]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP37:%.*]] = load i32, ptr addrspace(32) [[TMP34]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 15
-; CLEANUP-CPS-NEXT:    [[TMP37:%.*]] = load i32, ptr addrspace(32) [[TMP36]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP39:%.*]] = load i32, ptr addrspace(32) [[TMP36]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 16
-; CLEANUP-CPS-NEXT:    [[TMP39:%.*]] = load i32, ptr addrspace(32) [[TMP38]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP41:%.*]] = load i32, ptr addrspace(32) [[TMP38]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 17
-; CLEANUP-CPS-NEXT:    [[TMP41:%.*]] = load i32, ptr addrspace(32) [[TMP40]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP43:%.*]] = load i32, ptr addrspace(32) [[TMP40]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 18
-; CLEANUP-CPS-NEXT:    [[TMP43:%.*]] = load i32, ptr addrspace(32) [[TMP42]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP45:%.*]] = load i32, ptr addrspace(32) [[TMP42]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 19
-; CLEANUP-CPS-NEXT:    [[TMP45:%.*]] = load i32, ptr addrspace(32) [[TMP44]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP47:%.*]] = load i32, ptr addrspace(32) [[TMP44]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 20
-; CLEANUP-CPS-NEXT:    [[TMP47:%.*]] = load i32, ptr addrspace(32) [[TMP46]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(32) [[TMP46]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 21
-; CLEANUP-CPS-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(32) [[TMP48]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP51:%.*]] = load i32, ptr addrspace(32) [[TMP48]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 22
-; CLEANUP-CPS-NEXT:    [[TMP51:%.*]] = load i32, ptr addrspace(32) [[TMP50]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP53:%.*]] = load i32, ptr addrspace(32) [[TMP50]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 23
-; CLEANUP-CPS-NEXT:    [[TMP53:%.*]] = load i32, ptr addrspace(32) [[TMP52]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP55:%.*]] = load i32, ptr addrspace(32) [[TMP52]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 24
-; CLEANUP-CPS-NEXT:    [[TMP55:%.*]] = load i32, ptr addrspace(32) [[TMP54]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP57:%.*]] = load i32, ptr addrspace(32) [[TMP54]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 25
-; CLEANUP-CPS-NEXT:    [[TMP57:%.*]] = load i32, ptr addrspace(32) [[TMP56]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP59:%.*]] = load i32, ptr addrspace(32) [[TMP56]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 26
-; CLEANUP-CPS-NEXT:    [[TMP59:%.*]] = load i32, ptr addrspace(32) [[TMP58]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP62:%.*]] = load i32, ptr addrspace(32) [[TMP58]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP60:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(32)
 ; CLEANUP-CPS-NEXT:    [[TMP61:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT57:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP61]], 0
@@ -3701,7 +4051,7 @@ attributes #3 = { nounwind }
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @AnyHit(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META23:![0-9]+]] !lgc.cps [[META24:![0-9]+]] !continuation [[META25:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META23:![0-9]+]] !lgc.cps [[META24:![0-9]+]] !continuation [[META25:![0-9]+]] !continuation.state [[META8]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0
@@ -3956,12 +4306,12 @@ attributes #3 = { nounwind }
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_27_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @ClosestHit(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [21 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META26:![0-9]+]] !lgc.cps [[META23]] !continuation [[META27:![0-9]+]] !continuation.stacksize [[META21]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [21 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META26:![0-9]+]] !lgc.cps [[META23]] !continuation [[META27:![0-9]+]] !continuation.stacksize [[META28:![0-9]+]] !continuation.state [[META29:![0-9]+]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 116)
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[CLOSESTHIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
@@ -4157,12 +4507,12 @@ attributes #3 = { nounwind }
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_27_INSERT135:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT132]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT138:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT135]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT141:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT138]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP61]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT141]]), !waitmask [[META22]], !continuation.returnedRegistercount [[META17]], !continuation.registercount [[META17]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i32 poison, i64 [[TMP61]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT141]]), !waitmask [[META22]], !continuation.returnedRegistercount [[META17]], !continuation.registercount [[META17]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define dso_local void @ClosestHit.resume.0(
-; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [23 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META26]] !lgc.cps [[META23]] !continuation [[META27]] {
+; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [23 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META26]] !lgc.cps [[META23]] !continuation [[META27]] !continuation.registercount [[META17]] {
 ; CLEANUP-CPS-NEXT:  entryresume.0:
 ; CLEANUP-CPS-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 116)
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[CLOSESTHIT_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0
@@ -4197,60 +4547,111 @@ attributes #3 = { nounwind }
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 27
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 29
+; CLEANUP-CPS-NEXT:    [[TMP7:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 0
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 1
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 2
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 3
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 4
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 5
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_6_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 6
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_7_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 7
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_8_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 8
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_9_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 9
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_10_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 10
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_11_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 11
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_12_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 12
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_13_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 13
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_14_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 14
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_15_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 15
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_16_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 16
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_17_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 17
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_18_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 18
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_19_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 19
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_20_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 20
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_21_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 21
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_22_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 22
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_23_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 23
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_24_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 24
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_25_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 25
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_26_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 26
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_27_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 27
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_28_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 28
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_29_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 29
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_30_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 30
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_31_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 31
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_32_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 32
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_33_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 33
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_34_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 34
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_35_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 35
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_36_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 36
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_37_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 37
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_38_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 38
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_39_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 39
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_40_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 40
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_41_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 41
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_42_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 42
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_43_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 43
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_44_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 44
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_45_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 45
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_46_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 46
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_47_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 47
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_48_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 48
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_49_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 49
 ; CLEANUP-CPS-NEXT:    [[TMP6:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(32)
-; CLEANUP-CPS-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(32) [[TMP6]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(32) [[TMP6]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 1
-; CLEANUP-CPS-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(32) [[TMP8]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(32) [[TMP8]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 2
-; CLEANUP-CPS-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(32) [[TMP10]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(32) [[TMP10]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 3
-; CLEANUP-CPS-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(32) [[TMP12]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(32) [[TMP12]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 4
-; CLEANUP-CPS-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(32) [[TMP14]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(32) [[TMP14]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 5
-; CLEANUP-CPS-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(32) [[TMP16]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP19:%.*]] = load i32, ptr addrspace(32) [[TMP16]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 6
-; CLEANUP-CPS-NEXT:    [[TMP19:%.*]] = load i32, ptr addrspace(32) [[TMP18]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP21:%.*]] = load i32, ptr addrspace(32) [[TMP18]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 7
-; CLEANUP-CPS-NEXT:    [[TMP21:%.*]] = load i32, ptr addrspace(32) [[TMP20]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP23:%.*]] = load i32, ptr addrspace(32) [[TMP20]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 8
-; CLEANUP-CPS-NEXT:    [[TMP23:%.*]] = load i32, ptr addrspace(32) [[TMP22]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP25:%.*]] = load i32, ptr addrspace(32) [[TMP22]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 9
-; CLEANUP-CPS-NEXT:    [[TMP25:%.*]] = load i32, ptr addrspace(32) [[TMP24]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP27:%.*]] = load i32, ptr addrspace(32) [[TMP24]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 10
-; CLEANUP-CPS-NEXT:    [[TMP27:%.*]] = load i32, ptr addrspace(32) [[TMP26]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP29:%.*]] = load i32, ptr addrspace(32) [[TMP26]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 11
-; CLEANUP-CPS-NEXT:    [[TMP29:%.*]] = load i32, ptr addrspace(32) [[TMP28]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP31:%.*]] = load i32, ptr addrspace(32) [[TMP28]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 12
-; CLEANUP-CPS-NEXT:    [[TMP31:%.*]] = load i32, ptr addrspace(32) [[TMP30]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(32) [[TMP30]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 13
-; CLEANUP-CPS-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(32) [[TMP32]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP35:%.*]] = load i32, ptr addrspace(32) [[TMP32]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 14
-; CLEANUP-CPS-NEXT:    [[TMP35:%.*]] = load i32, ptr addrspace(32) [[TMP34]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP37:%.*]] = load i32, ptr addrspace(32) [[TMP34]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 15
-; CLEANUP-CPS-NEXT:    [[TMP37:%.*]] = load i32, ptr addrspace(32) [[TMP36]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP39:%.*]] = load i32, ptr addrspace(32) [[TMP36]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 16
-; CLEANUP-CPS-NEXT:    [[TMP39:%.*]] = load i32, ptr addrspace(32) [[TMP38]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP41:%.*]] = load i32, ptr addrspace(32) [[TMP38]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 17
-; CLEANUP-CPS-NEXT:    [[TMP41:%.*]] = load i32, ptr addrspace(32) [[TMP40]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP43:%.*]] = load i32, ptr addrspace(32) [[TMP40]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 18
-; CLEANUP-CPS-NEXT:    [[TMP43:%.*]] = load i32, ptr addrspace(32) [[TMP42]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP45:%.*]] = load i32, ptr addrspace(32) [[TMP42]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 19
-; CLEANUP-CPS-NEXT:    [[TMP45:%.*]] = load i32, ptr addrspace(32) [[TMP44]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP47:%.*]] = load i32, ptr addrspace(32) [[TMP44]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 20
-; CLEANUP-CPS-NEXT:    [[TMP47:%.*]] = load i32, ptr addrspace(32) [[TMP46]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(32) [[TMP46]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 21
-; CLEANUP-CPS-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(32) [[TMP48]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP51:%.*]] = load i32, ptr addrspace(32) [[TMP48]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 22
-; CLEANUP-CPS-NEXT:    [[TMP51:%.*]] = load i32, ptr addrspace(32) [[TMP50]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP53:%.*]] = load i32, ptr addrspace(32) [[TMP50]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 23
-; CLEANUP-CPS-NEXT:    [[TMP53:%.*]] = load i32, ptr addrspace(32) [[TMP52]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP55:%.*]] = load i32, ptr addrspace(32) [[TMP52]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 24
-; CLEANUP-CPS-NEXT:    [[TMP55:%.*]] = load i32, ptr addrspace(32) [[TMP54]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP57:%.*]] = load i32, ptr addrspace(32) [[TMP54]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 25
-; CLEANUP-CPS-NEXT:    [[TMP57:%.*]] = load i32, ptr addrspace(32) [[TMP56]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP59:%.*]] = load i32, ptr addrspace(32) [[TMP56]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 26
-; CLEANUP-CPS-NEXT:    [[TMP59:%.*]] = load i32, ptr addrspace(32) [[TMP58]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP89:%.*]] = load i32, ptr addrspace(32) [[TMP58]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP60:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(32)
 ; CLEANUP-CPS-NEXT:    [[TMP61:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT254:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP61]], 0
@@ -4260,59 +4661,59 @@ attributes #3 = { nounwind }
 ; CLEANUP-CPS-NEXT:    [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[CLOSESTHIT_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 1
 ; CLEANUP-CPS-NEXT:    [[RETURNADDR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[RETURNADDR_RELOAD_ADDR]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP62:%.*]] = inttoptr i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]] to ptr addrspace(32)
-; CLEANUP-CPS-NEXT:    store i32 [[TMP7]], ptr addrspace(32) [[TMP62]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP9]], ptr addrspace(32) [[TMP62]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 1
-; CLEANUP-CPS-NEXT:    store i32 [[TMP9]], ptr addrspace(32) [[TMP63]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP11]], ptr addrspace(32) [[TMP63]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP64:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 2
-; CLEANUP-CPS-NEXT:    store i32 [[TMP11]], ptr addrspace(32) [[TMP64]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP13]], ptr addrspace(32) [[TMP64]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 3
-; CLEANUP-CPS-NEXT:    store i32 [[TMP13]], ptr addrspace(32) [[TMP65]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP15]], ptr addrspace(32) [[TMP65]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 4
-; CLEANUP-CPS-NEXT:    store i32 [[TMP15]], ptr addrspace(32) [[TMP66]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP17]], ptr addrspace(32) [[TMP66]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP67:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 5
-; CLEANUP-CPS-NEXT:    store i32 [[TMP17]], ptr addrspace(32) [[TMP67]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP19]], ptr addrspace(32) [[TMP67]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 6
-; CLEANUP-CPS-NEXT:    store i32 [[TMP19]], ptr addrspace(32) [[TMP68]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP21]], ptr addrspace(32) [[TMP68]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP69:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 7
-; CLEANUP-CPS-NEXT:    store i32 [[TMP21]], ptr addrspace(32) [[TMP69]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP23]], ptr addrspace(32) [[TMP69]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 8
-; CLEANUP-CPS-NEXT:    store i32 [[TMP23]], ptr addrspace(32) [[TMP70]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP25]], ptr addrspace(32) [[TMP70]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 9
-; CLEANUP-CPS-NEXT:    store i32 [[TMP25]], ptr addrspace(32) [[TMP71]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP27]], ptr addrspace(32) [[TMP71]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP72:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 10
-; CLEANUP-CPS-NEXT:    store i32 [[TMP27]], ptr addrspace(32) [[TMP72]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP29]], ptr addrspace(32) [[TMP72]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 11
-; CLEANUP-CPS-NEXT:    store i32 [[TMP29]], ptr addrspace(32) [[TMP73]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP31]], ptr addrspace(32) [[TMP73]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP74:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 12
-; CLEANUP-CPS-NEXT:    store i32 [[TMP31]], ptr addrspace(32) [[TMP74]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP33]], ptr addrspace(32) [[TMP74]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 13
-; CLEANUP-CPS-NEXT:    store i32 [[TMP33]], ptr addrspace(32) [[TMP75]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP35]], ptr addrspace(32) [[TMP75]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 14
-; CLEANUP-CPS-NEXT:    store i32 [[TMP35]], ptr addrspace(32) [[TMP76]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP37]], ptr addrspace(32) [[TMP76]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP77:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 15
-; CLEANUP-CPS-NEXT:    store i32 [[TMP37]], ptr addrspace(32) [[TMP77]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP39]], ptr addrspace(32) [[TMP77]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP78:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 16
-; CLEANUP-CPS-NEXT:    store i32 [[TMP39]], ptr addrspace(32) [[TMP78]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP41]], ptr addrspace(32) [[TMP78]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP79:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 17
-; CLEANUP-CPS-NEXT:    store i32 [[TMP41]], ptr addrspace(32) [[TMP79]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP43]], ptr addrspace(32) [[TMP79]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP80:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 18
-; CLEANUP-CPS-NEXT:    store i32 [[TMP43]], ptr addrspace(32) [[TMP80]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP45]], ptr addrspace(32) [[TMP80]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP81:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 19
-; CLEANUP-CPS-NEXT:    store i32 [[TMP45]], ptr addrspace(32) [[TMP81]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP47]], ptr addrspace(32) [[TMP81]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP82:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 20
-; CLEANUP-CPS-NEXT:    store i32 [[TMP47]], ptr addrspace(32) [[TMP82]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP49]], ptr addrspace(32) [[TMP82]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP83:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 21
-; CLEANUP-CPS-NEXT:    store i32 [[TMP49]], ptr addrspace(32) [[TMP83]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP51]], ptr addrspace(32) [[TMP83]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP84:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 22
-; CLEANUP-CPS-NEXT:    store i32 [[TMP51]], ptr addrspace(32) [[TMP84]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP53]], ptr addrspace(32) [[TMP84]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP85:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 23
-; CLEANUP-CPS-NEXT:    store i32 [[TMP53]], ptr addrspace(32) [[TMP85]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP55]], ptr addrspace(32) [[TMP85]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP86:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 24
-; CLEANUP-CPS-NEXT:    store i32 [[TMP55]], ptr addrspace(32) [[TMP86]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP57]], ptr addrspace(32) [[TMP86]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP87:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 25
-; CLEANUP-CPS-NEXT:    store i32 [[TMP57]], ptr addrspace(32) [[TMP87]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP59]], ptr addrspace(32) [[TMP87]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP88:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 26
-; CLEANUP-CPS-NEXT:    store i32 [[TMP59]], ptr addrspace(32) [[TMP88]], align 4
+; CLEANUP-CPS-NEXT:    store i32 [[TMP89]], ptr addrspace(32) [[TMP88]], align 4
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_INSERT253:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT254]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [30 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT]], i32 [[DOTFCA_1_EXTRACT]], 1
@@ -4345,7 +4746,7 @@ attributes #3 = { nounwind }
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 116)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT253]], [23 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT253]], [23 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
@@ -4561,113 +4962,164 @@ attributes #3 = { nounwind }
 ; POST-PROCESS-CPS-NEXT:    [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 27
 ; POST-PROCESS-CPS-NEXT:    [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 28
 ; POST-PROCESS-CPS-NEXT:    [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 29
-; POST-PROCESS-CPS-NEXT:    [[TMP7:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP7]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(21) [[TMP8]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP7:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 0
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 1
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 2
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 3
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 4
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 5
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_6_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 6
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_7_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 7
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_8_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 8
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_9_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 9
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_10_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 10
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_11_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 11
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_12_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 12
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_13_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 13
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_14_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 14
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_15_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 15
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_16_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 16
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_17_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 17
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_18_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 18
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_19_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 19
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_20_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 20
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_21_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 21
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_22_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 22
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_23_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 23
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_24_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 24
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_25_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 25
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_26_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 26
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_27_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 27
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_28_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 28
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_29_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 29
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_30_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 30
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_31_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 31
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_32_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 32
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_33_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 33
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_34_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 34
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_35_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 35
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_36_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 36
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_37_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 37
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_38_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 38
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_39_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 39
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_40_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 40
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_41_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 41
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_42_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 42
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_43_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 43
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_44_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 44
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_45_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 45
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_46_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 46
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_47_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 47
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_48_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 48
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_49_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 49
+; POST-PROCESS-CPS-NEXT:    [[TMP8:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP8]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(21) [[TMP9]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP10:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 4
-; POST-PROCESS-CPS-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP10]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP11]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(21) [[TMP12]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP12:%.*]] = inttoptr i32 [[TMP10]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP12]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(21) [[TMP13]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP14:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 8
-; POST-PROCESS-CPS-NEXT:    [[TMP15:%.*]] = inttoptr i32 [[TMP14]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP15]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(21) [[TMP16]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP16:%.*]] = inttoptr i32 [[TMP14]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP16]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP19:%.*]] = load i32, ptr addrspace(21) [[TMP17]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP18:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 12
-; POST-PROCESS-CPS-NEXT:    [[TMP19:%.*]] = inttoptr i32 [[TMP18]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP19]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP21:%.*]] = load i32, ptr addrspace(21) [[TMP20]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP20:%.*]] = inttoptr i32 [[TMP18]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP20]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP23:%.*]] = load i32, ptr addrspace(21) [[TMP21]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP22:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 16
-; POST-PROCESS-CPS-NEXT:    [[TMP23:%.*]] = inttoptr i32 [[TMP22]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP23]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP25:%.*]] = load i32, ptr addrspace(21) [[TMP24]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP24:%.*]] = inttoptr i32 [[TMP22]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP24]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP27:%.*]] = load i32, ptr addrspace(21) [[TMP25]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP26:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 20
-; POST-PROCESS-CPS-NEXT:    [[TMP27:%.*]] = inttoptr i32 [[TMP26]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP27]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP29:%.*]] = load i32, ptr addrspace(21) [[TMP28]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP28:%.*]] = inttoptr i32 [[TMP26]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP28]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP31:%.*]] = load i32, ptr addrspace(21) [[TMP29]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP30:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 24
-; POST-PROCESS-CPS-NEXT:    [[TMP31:%.*]] = inttoptr i32 [[TMP30]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP31]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(21) [[TMP32]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP32:%.*]] = inttoptr i32 [[TMP30]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP32]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP35:%.*]] = load i32, ptr addrspace(21) [[TMP33]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP34:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 28
-; POST-PROCESS-CPS-NEXT:    [[TMP35:%.*]] = inttoptr i32 [[TMP34]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP35]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP37:%.*]] = load i32, ptr addrspace(21) [[TMP36]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP36:%.*]] = inttoptr i32 [[TMP34]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP36]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP39:%.*]] = load i32, ptr addrspace(21) [[TMP37]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP38:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 32
-; POST-PROCESS-CPS-NEXT:    [[TMP39:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP39]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP41:%.*]] = load i32, ptr addrspace(21) [[TMP40]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP40:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP40]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP43:%.*]] = load i32, ptr addrspace(21) [[TMP41]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP42:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 36
-; POST-PROCESS-CPS-NEXT:    [[TMP43:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP43]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP45:%.*]] = load i32, ptr addrspace(21) [[TMP44]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP44:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP45:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP44]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP47:%.*]] = load i32, ptr addrspace(21) [[TMP45]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP46:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 40
-; POST-PROCESS-CPS-NEXT:    [[TMP47:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP47]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(21) [[TMP48]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP48:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP48]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP51:%.*]] = load i32, ptr addrspace(21) [[TMP49]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP50:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 44
-; POST-PROCESS-CPS-NEXT:    [[TMP51:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP52:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP51]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP53:%.*]] = load i32, ptr addrspace(21) [[TMP52]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP52:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP53:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP52]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP55:%.*]] = load i32, ptr addrspace(21) [[TMP53]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP54:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 48
-; POST-PROCESS-CPS-NEXT:    [[TMP55:%.*]] = inttoptr i32 [[TMP54]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP56:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP55]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP57:%.*]] = load i32, ptr addrspace(21) [[TMP56]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP56:%.*]] = inttoptr i32 [[TMP54]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP57:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP56]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP59:%.*]] = load i32, ptr addrspace(21) [[TMP57]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP58:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 52
-; POST-PROCESS-CPS-NEXT:    [[TMP59:%.*]] = inttoptr i32 [[TMP58]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP59]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP61:%.*]] = load i32, ptr addrspace(21) [[TMP60]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP60:%.*]] = inttoptr i32 [[TMP58]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP61:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP60]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP63:%.*]] = load i32, ptr addrspace(21) [[TMP61]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP62:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 56
-; POST-PROCESS-CPS-NEXT:    [[TMP63:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP63]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP65:%.*]] = load i32, ptr addrspace(21) [[TMP64]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP64:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP65:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP64]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP67:%.*]] = load i32, ptr addrspace(21) [[TMP65]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP66:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 60
-; POST-PROCESS-CPS-NEXT:    [[TMP67:%.*]] = inttoptr i32 [[TMP66]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP67]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP69:%.*]] = load i32, ptr addrspace(21) [[TMP68]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP68:%.*]] = inttoptr i32 [[TMP66]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP69:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP68]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP71:%.*]] = load i32, ptr addrspace(21) [[TMP69]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP70:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 64
-; POST-PROCESS-CPS-NEXT:    [[TMP71:%.*]] = inttoptr i32 [[TMP70]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP72:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP71]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP73:%.*]] = load i32, ptr addrspace(21) [[TMP72]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP72:%.*]] = inttoptr i32 [[TMP70]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP73:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP72]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP75:%.*]] = load i32, ptr addrspace(21) [[TMP73]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP74:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 68
-; POST-PROCESS-CPS-NEXT:    [[TMP75:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP76:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP75]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP77:%.*]] = load i32, ptr addrspace(21) [[TMP76]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP76:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP77:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP76]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP79:%.*]] = load i32, ptr addrspace(21) [[TMP77]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP78:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 72
-; POST-PROCESS-CPS-NEXT:    [[TMP79:%.*]] = inttoptr i32 [[TMP78]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP80:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP79]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP81:%.*]] = load i32, ptr addrspace(21) [[TMP80]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP80:%.*]] = inttoptr i32 [[TMP78]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP81:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP80]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP83:%.*]] = load i32, ptr addrspace(21) [[TMP81]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP82:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 76
-; POST-PROCESS-CPS-NEXT:    [[TMP83:%.*]] = inttoptr i32 [[TMP82]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP84:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP83]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP85:%.*]] = load i32, ptr addrspace(21) [[TMP84]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP84:%.*]] = inttoptr i32 [[TMP82]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP85:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP84]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP87:%.*]] = load i32, ptr addrspace(21) [[TMP85]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP86:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 80
-; POST-PROCESS-CPS-NEXT:    [[TMP87:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP88:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP87]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP89:%.*]] = load i32, ptr addrspace(21) [[TMP88]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP88:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP89:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP88]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP91:%.*]] = load i32, ptr addrspace(21) [[TMP89]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP90:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 84
-; POST-PROCESS-CPS-NEXT:    [[TMP91:%.*]] = inttoptr i32 [[TMP90]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP92:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP91]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP93:%.*]] = load i32, ptr addrspace(21) [[TMP92]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP92:%.*]] = inttoptr i32 [[TMP90]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP93:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP92]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP95:%.*]] = load i32, ptr addrspace(21) [[TMP93]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP94:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 88
-; POST-PROCESS-CPS-NEXT:    [[TMP95:%.*]] = inttoptr i32 [[TMP94]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP96:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP95]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP97:%.*]] = load i32, ptr addrspace(21) [[TMP96]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP96:%.*]] = inttoptr i32 [[TMP94]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP97:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP96]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP99:%.*]] = load i32, ptr addrspace(21) [[TMP97]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP98:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 92
-; POST-PROCESS-CPS-NEXT:    [[TMP99:%.*]] = inttoptr i32 [[TMP98]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP100:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP99]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP101:%.*]] = load i32, ptr addrspace(21) [[TMP100]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP100:%.*]] = inttoptr i32 [[TMP98]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP101:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP100]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP103:%.*]] = load i32, ptr addrspace(21) [[TMP101]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP102:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 96
-; POST-PROCESS-CPS-NEXT:    [[TMP103:%.*]] = inttoptr i32 [[TMP102]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP104:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP103]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP105:%.*]] = load i32, ptr addrspace(21) [[TMP104]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP104:%.*]] = inttoptr i32 [[TMP102]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP105:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP104]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP107:%.*]] = load i32, ptr addrspace(21) [[TMP105]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP106:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 100
-; POST-PROCESS-CPS-NEXT:    [[TMP107:%.*]] = inttoptr i32 [[TMP106]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP108:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP107]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP109:%.*]] = load i32, ptr addrspace(21) [[TMP108]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP108:%.*]] = inttoptr i32 [[TMP106]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP109:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP108]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP111:%.*]] = load i32, ptr addrspace(21) [[TMP109]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP110:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 104
-; POST-PROCESS-CPS-NEXT:    [[TMP111:%.*]] = inttoptr i32 [[TMP110]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP112:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP111]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP113:%.*]] = load i32, ptr addrspace(21) [[TMP112]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP112:%.*]] = inttoptr i32 [[TMP110]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP113:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP112]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP117:%.*]] = load i32, ptr addrspace(21) [[TMP113]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP114:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 0
 ; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_EXTRACT57:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP114]], 0
 ; POST-PROCESS-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
@@ -5048,7 +5500,7 @@ attributes #3 = { nounwind }
 ;
 ;
 ; POST-PROCESS-CPS-LABEL: define void @ClosestHit(
-; POST-PROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [21 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META25:![0-9]+]] !lgc.cps [[META22]] !continuation [[META26:![0-9]+]] !continuation.stacksize [[META21]] {
+; POST-PROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [21 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META25:![0-9]+]] !lgc.cps [[META22]] !continuation [[META26:![0-9]+]] !continuation.stacksize [[META27:![0-9]+]] {
 ; POST-PROCESS-CPS-NEXT:  AllocaSpillBB:
 ; POST-PROCESS-CPS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; POST-PROCESS-CPS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -5399,230 +5851,281 @@ attributes #3 = { nounwind }
 ; POST-PROCESS-CPS-NEXT:    [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 27
 ; POST-PROCESS-CPS-NEXT:    [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 28
 ; POST-PROCESS-CPS-NEXT:    [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 29
-; POST-PROCESS-CPS-NEXT:    [[TMP7:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP7]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(21) [[TMP8]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP7:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 0
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 1
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 2
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 3
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 4
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 5
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_6_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 6
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_7_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 7
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_8_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 8
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_9_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 9
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_10_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 10
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_11_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 11
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_12_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 12
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_13_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 13
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_14_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 14
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_15_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 15
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_16_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 16
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_17_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 17
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_18_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 18
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_19_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 19
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_20_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 20
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_21_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 21
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_22_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 22
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_23_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 23
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_24_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 24
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_25_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 25
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_26_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 26
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_27_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 27
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_28_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 28
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_29_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 29
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_30_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 30
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_31_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 31
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_32_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 32
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_33_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 33
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_34_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 34
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_35_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 35
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_36_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 36
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_37_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 37
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_38_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 38
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_39_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 39
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_40_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 40
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_41_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 41
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_42_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 42
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_43_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 43
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_44_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 44
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_45_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 45
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_46_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 46
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_47_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 47
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_48_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 48
+; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_49_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP7]], 0, 49
+; POST-PROCESS-CPS-NEXT:    [[TMP8:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP8]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(21) [[TMP11]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP10:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 4
-; POST-PROCESS-CPS-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP10]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP11]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(21) [[TMP12]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP12:%.*]] = inttoptr i32 [[TMP10]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP12]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(21) [[TMP15]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP14:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 8
-; POST-PROCESS-CPS-NEXT:    [[TMP15:%.*]] = inttoptr i32 [[TMP14]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP15]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(21) [[TMP16]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP16:%.*]] = inttoptr i32 [[TMP14]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP16]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(21) [[TMP19]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP18:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 12
-; POST-PROCESS-CPS-NEXT:    [[TMP19:%.*]] = inttoptr i32 [[TMP18]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP19]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP21:%.*]] = load i32, ptr addrspace(21) [[TMP20]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP20:%.*]] = inttoptr i32 [[TMP18]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP20]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP21:%.*]] = load i32, ptr addrspace(21) [[TMP23]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP22:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 16
-; POST-PROCESS-CPS-NEXT:    [[TMP23:%.*]] = inttoptr i32 [[TMP22]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP23]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP25:%.*]] = load i32, ptr addrspace(21) [[TMP24]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP24:%.*]] = inttoptr i32 [[TMP22]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP24]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP25:%.*]] = load i32, ptr addrspace(21) [[TMP27]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP26:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 20
-; POST-PROCESS-CPS-NEXT:    [[TMP27:%.*]] = inttoptr i32 [[TMP26]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP27]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP29:%.*]] = load i32, ptr addrspace(21) [[TMP28]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP28:%.*]] = inttoptr i32 [[TMP26]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP31:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP28]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP29:%.*]] = load i32, ptr addrspace(21) [[TMP31]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP30:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 24
-; POST-PROCESS-CPS-NEXT:    [[TMP31:%.*]] = inttoptr i32 [[TMP30]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP31]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(21) [[TMP32]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP32:%.*]] = inttoptr i32 [[TMP30]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP32]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(21) [[TMP35]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP34:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 28
-; POST-PROCESS-CPS-NEXT:    [[TMP35:%.*]] = inttoptr i32 [[TMP34]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP35]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP37:%.*]] = load i32, ptr addrspace(21) [[TMP36]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP36:%.*]] = inttoptr i32 [[TMP34]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP36]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP37:%.*]] = load i32, ptr addrspace(21) [[TMP39]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP38:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 32
-; POST-PROCESS-CPS-NEXT:    [[TMP39:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP39]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP41:%.*]] = load i32, ptr addrspace(21) [[TMP40]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP40:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP40]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP41:%.*]] = load i32, ptr addrspace(21) [[TMP43]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP42:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 36
-; POST-PROCESS-CPS-NEXT:    [[TMP43:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP43]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP45:%.*]] = load i32, ptr addrspace(21) [[TMP44]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP44:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP47:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP44]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP45:%.*]] = load i32, ptr addrspace(21) [[TMP47]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP46:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 40
-; POST-PROCESS-CPS-NEXT:    [[TMP47:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP47]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(21) [[TMP48]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP48:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP48]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(21) [[TMP51]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP50:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 44
-; POST-PROCESS-CPS-NEXT:    [[TMP51:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP52:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP51]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP53:%.*]] = load i32, ptr addrspace(21) [[TMP52]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP52:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP55:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP52]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP53:%.*]] = load i32, ptr addrspace(21) [[TMP55]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP54:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 48
-; POST-PROCESS-CPS-NEXT:    [[TMP55:%.*]] = inttoptr i32 [[TMP54]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP56:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP55]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP57:%.*]] = load i32, ptr addrspace(21) [[TMP56]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP56:%.*]] = inttoptr i32 [[TMP54]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP59:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP56]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP57:%.*]] = load i32, ptr addrspace(21) [[TMP59]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP58:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 52
-; POST-PROCESS-CPS-NEXT:    [[TMP59:%.*]] = inttoptr i32 [[TMP58]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP59]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP61:%.*]] = load i32, ptr addrspace(21) [[TMP60]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP60:%.*]] = inttoptr i32 [[TMP58]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP60]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP61:%.*]] = load i32, ptr addrspace(21) [[TMP63]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP62:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 56
-; POST-PROCESS-CPS-NEXT:    [[TMP63:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP63]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP65:%.*]] = load i32, ptr addrspace(21) [[TMP64]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP64:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP67:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP64]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP65:%.*]] = load i32, ptr addrspace(21) [[TMP67]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP66:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 60
-; POST-PROCESS-CPS-NEXT:    [[TMP67:%.*]] = inttoptr i32 [[TMP66]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP67]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP69:%.*]] = load i32, ptr addrspace(21) [[TMP68]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP68:%.*]] = inttoptr i32 [[TMP66]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP71:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP68]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP69:%.*]] = load i32, ptr addrspace(21) [[TMP71]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP70:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 64
-; POST-PROCESS-CPS-NEXT:    [[TMP71:%.*]] = inttoptr i32 [[TMP70]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP72:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP71]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP73:%.*]] = load i32, ptr addrspace(21) [[TMP72]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP72:%.*]] = inttoptr i32 [[TMP70]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP75:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP72]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP73:%.*]] = load i32, ptr addrspace(21) [[TMP75]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP74:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 68
-; POST-PROCESS-CPS-NEXT:    [[TMP75:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP76:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP75]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP77:%.*]] = load i32, ptr addrspace(21) [[TMP76]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP76:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP79:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP76]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP77:%.*]] = load i32, ptr addrspace(21) [[TMP79]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP78:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 72
-; POST-PROCESS-CPS-NEXT:    [[TMP79:%.*]] = inttoptr i32 [[TMP78]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP80:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP79]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP81:%.*]] = load i32, ptr addrspace(21) [[TMP80]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP80:%.*]] = inttoptr i32 [[TMP78]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP83:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP80]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP81:%.*]] = load i32, ptr addrspace(21) [[TMP83]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP82:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 76
-; POST-PROCESS-CPS-NEXT:    [[TMP83:%.*]] = inttoptr i32 [[TMP82]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP84:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP83]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP85:%.*]] = load i32, ptr addrspace(21) [[TMP84]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP84:%.*]] = inttoptr i32 [[TMP82]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP87:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP84]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP85:%.*]] = load i32, ptr addrspace(21) [[TMP87]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP86:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 80
-; POST-PROCESS-CPS-NEXT:    [[TMP87:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP88:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP87]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP89:%.*]] = load i32, ptr addrspace(21) [[TMP88]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP88:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP91:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP88]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP89:%.*]] = load i32, ptr addrspace(21) [[TMP91]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP90:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 84
-; POST-PROCESS-CPS-NEXT:    [[TMP91:%.*]] = inttoptr i32 [[TMP90]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP92:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP91]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP93:%.*]] = load i32, ptr addrspace(21) [[TMP92]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP92:%.*]] = inttoptr i32 [[TMP90]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP95:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP92]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP93:%.*]] = load i32, ptr addrspace(21) [[TMP95]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP94:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 88
-; POST-PROCESS-CPS-NEXT:    [[TMP95:%.*]] = inttoptr i32 [[TMP94]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP96:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP95]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP97:%.*]] = load i32, ptr addrspace(21) [[TMP96]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP96:%.*]] = inttoptr i32 [[TMP94]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP99:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP96]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP97:%.*]] = load i32, ptr addrspace(21) [[TMP99]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP98:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 92
-; POST-PROCESS-CPS-NEXT:    [[TMP99:%.*]] = inttoptr i32 [[TMP98]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP100:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP99]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP101:%.*]] = load i32, ptr addrspace(21) [[TMP100]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP100:%.*]] = inttoptr i32 [[TMP98]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP103:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP100]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP101:%.*]] = load i32, ptr addrspace(21) [[TMP103]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP102:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 96
-; POST-PROCESS-CPS-NEXT:    [[TMP103:%.*]] = inttoptr i32 [[TMP102]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP104:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP103]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP105:%.*]] = load i32, ptr addrspace(21) [[TMP104]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP104:%.*]] = inttoptr i32 [[TMP102]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP107:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP104]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP105:%.*]] = load i32, ptr addrspace(21) [[TMP107]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP106:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 100
-; POST-PROCESS-CPS-NEXT:    [[TMP107:%.*]] = inttoptr i32 [[TMP106]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP108:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP107]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP109:%.*]] = load i32, ptr addrspace(21) [[TMP108]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP108:%.*]] = inttoptr i32 [[TMP106]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP111:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP108]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP109:%.*]] = load i32, ptr addrspace(21) [[TMP111]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP110:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 104
-; POST-PROCESS-CPS-NEXT:    [[TMP111:%.*]] = inttoptr i32 [[TMP110]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP112:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP111]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[TMP113:%.*]] = load i32, ptr addrspace(21) [[TMP112]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP112:%.*]] = inttoptr i32 [[TMP110]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP116:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP112]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP113:%.*]] = load i32, ptr addrspace(21) [[TMP116]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP114:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 0
 ; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_EXTRACT254:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP114]], 0
 ; POST-PROCESS-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
 ; POST-PROCESS-CPS-NEXT:    [[TMP115:%.*]] = add i32 [[TMP5]], 112
-; POST-PROCESS-CPS-NEXT:    [[TMP116:%.*]] = inttoptr i32 [[TMP115]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP117:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP116]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[PAYLOAD_FCA_0_EXTRACT_RELOAD:%.*]] = load i32, ptr addrspace(21) [[TMP117]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP117:%.*]] = inttoptr i32 [[TMP115]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP119:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP117]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[PAYLOAD_FCA_0_EXTRACT_RELOAD:%.*]] = load i32, ptr addrspace(21) [[TMP119]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP118:%.*]] = add i32 [[TMP5]], 108
-; POST-PROCESS-CPS-NEXT:    [[TMP119:%.*]] = inttoptr i32 [[TMP118]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP120:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP119]], i32 0
-; POST-PROCESS-CPS-NEXT:    [[RETURNADDR_RELOAD:%.*]] = load i32, ptr addrspace(21) [[TMP120]], align 4
-; POST-PROCESS-CPS-NEXT:    [[TMP121:%.*]] = inttoptr i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP122:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP121]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP120:%.*]] = inttoptr i32 [[TMP118]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP121:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP120]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[RETURNADDR_RELOAD:%.*]] = load i32, ptr addrspace(21) [[TMP121]], align 4
+; POST-PROCESS-CPS-NEXT:    [[TMP124:%.*]] = inttoptr i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP122:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP124]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP9]], ptr addrspace(21) [[TMP122]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP123:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 4
-; POST-PROCESS-CPS-NEXT:    [[TMP124:%.*]] = inttoptr i32 [[TMP123]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP125:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP124]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP127:%.*]] = inttoptr i32 [[TMP123]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP125:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP127]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP13]], ptr addrspace(21) [[TMP125]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP126:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 8
-; POST-PROCESS-CPS-NEXT:    [[TMP127:%.*]] = inttoptr i32 [[TMP126]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP128:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP127]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP130:%.*]] = inttoptr i32 [[TMP126]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP128:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP130]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP17]], ptr addrspace(21) [[TMP128]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP129:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 12
-; POST-PROCESS-CPS-NEXT:    [[TMP130:%.*]] = inttoptr i32 [[TMP129]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP131:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP130]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP133:%.*]] = inttoptr i32 [[TMP129]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP131:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP133]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP21]], ptr addrspace(21) [[TMP131]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP132:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 16
-; POST-PROCESS-CPS-NEXT:    [[TMP133:%.*]] = inttoptr i32 [[TMP132]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP134:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP133]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP136:%.*]] = inttoptr i32 [[TMP132]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP134:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP136]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP25]], ptr addrspace(21) [[TMP134]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP135:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 20
-; POST-PROCESS-CPS-NEXT:    [[TMP136:%.*]] = inttoptr i32 [[TMP135]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP137:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP136]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP139:%.*]] = inttoptr i32 [[TMP135]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP137:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP139]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP29]], ptr addrspace(21) [[TMP137]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP138:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 24
-; POST-PROCESS-CPS-NEXT:    [[TMP139:%.*]] = inttoptr i32 [[TMP138]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP140:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP139]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP142:%.*]] = inttoptr i32 [[TMP138]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP140:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP142]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP33]], ptr addrspace(21) [[TMP140]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP141:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 28
-; POST-PROCESS-CPS-NEXT:    [[TMP142:%.*]] = inttoptr i32 [[TMP141]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP143:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP142]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP145:%.*]] = inttoptr i32 [[TMP141]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP143:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP145]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP37]], ptr addrspace(21) [[TMP143]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP144:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 32
-; POST-PROCESS-CPS-NEXT:    [[TMP145:%.*]] = inttoptr i32 [[TMP144]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP146:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP145]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP148:%.*]] = inttoptr i32 [[TMP144]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP146:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP148]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP41]], ptr addrspace(21) [[TMP146]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP147:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 36
-; POST-PROCESS-CPS-NEXT:    [[TMP148:%.*]] = inttoptr i32 [[TMP147]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP149:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP148]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP151:%.*]] = inttoptr i32 [[TMP147]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP149:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP151]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP45]], ptr addrspace(21) [[TMP149]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP150:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 40
-; POST-PROCESS-CPS-NEXT:    [[TMP151:%.*]] = inttoptr i32 [[TMP150]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP152:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP151]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP154:%.*]] = inttoptr i32 [[TMP150]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP152:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP154]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP49]], ptr addrspace(21) [[TMP152]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP153:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 44
-; POST-PROCESS-CPS-NEXT:    [[TMP154:%.*]] = inttoptr i32 [[TMP153]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP155:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP154]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP157:%.*]] = inttoptr i32 [[TMP153]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP155:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP157]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP53]], ptr addrspace(21) [[TMP155]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP156:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 48
-; POST-PROCESS-CPS-NEXT:    [[TMP157:%.*]] = inttoptr i32 [[TMP156]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP158:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP157]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP160:%.*]] = inttoptr i32 [[TMP156]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP158:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP160]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP57]], ptr addrspace(21) [[TMP158]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP159:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 52
-; POST-PROCESS-CPS-NEXT:    [[TMP160:%.*]] = inttoptr i32 [[TMP159]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP161:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP160]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP163:%.*]] = inttoptr i32 [[TMP159]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP161:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP163]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP61]], ptr addrspace(21) [[TMP161]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP162:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 56
-; POST-PROCESS-CPS-NEXT:    [[TMP163:%.*]] = inttoptr i32 [[TMP162]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP164:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP163]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP166:%.*]] = inttoptr i32 [[TMP162]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP164:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP166]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP65]], ptr addrspace(21) [[TMP164]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP165:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 60
-; POST-PROCESS-CPS-NEXT:    [[TMP166:%.*]] = inttoptr i32 [[TMP165]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP167:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP166]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP169:%.*]] = inttoptr i32 [[TMP165]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP167:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP169]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP69]], ptr addrspace(21) [[TMP167]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP168:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 64
-; POST-PROCESS-CPS-NEXT:    [[TMP169:%.*]] = inttoptr i32 [[TMP168]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP170:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP169]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP172:%.*]] = inttoptr i32 [[TMP168]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP170:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP172]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP73]], ptr addrspace(21) [[TMP170]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP171:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 68
-; POST-PROCESS-CPS-NEXT:    [[TMP172:%.*]] = inttoptr i32 [[TMP171]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP173:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP172]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP175:%.*]] = inttoptr i32 [[TMP171]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP173:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP175]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP77]], ptr addrspace(21) [[TMP173]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP174:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 72
-; POST-PROCESS-CPS-NEXT:    [[TMP175:%.*]] = inttoptr i32 [[TMP174]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP176:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP175]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP178:%.*]] = inttoptr i32 [[TMP174]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP176:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP178]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP81]], ptr addrspace(21) [[TMP176]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP177:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 76
-; POST-PROCESS-CPS-NEXT:    [[TMP178:%.*]] = inttoptr i32 [[TMP177]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP179:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP178]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP181:%.*]] = inttoptr i32 [[TMP177]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP179:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP181]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP85]], ptr addrspace(21) [[TMP179]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP180:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 80
-; POST-PROCESS-CPS-NEXT:    [[TMP181:%.*]] = inttoptr i32 [[TMP180]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP182:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP181]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP184:%.*]] = inttoptr i32 [[TMP180]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP182:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP184]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP89]], ptr addrspace(21) [[TMP182]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP183:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 84
-; POST-PROCESS-CPS-NEXT:    [[TMP184:%.*]] = inttoptr i32 [[TMP183]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP185:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP184]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP187:%.*]] = inttoptr i32 [[TMP183]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP185:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP187]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP93]], ptr addrspace(21) [[TMP185]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP186:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 88
-; POST-PROCESS-CPS-NEXT:    [[TMP187:%.*]] = inttoptr i32 [[TMP186]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP188:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP187]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP190:%.*]] = inttoptr i32 [[TMP186]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP188:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP190]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP97]], ptr addrspace(21) [[TMP188]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP189:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 92
-; POST-PROCESS-CPS-NEXT:    [[TMP190:%.*]] = inttoptr i32 [[TMP189]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP191:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP190]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP193:%.*]] = inttoptr i32 [[TMP189]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP191:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP193]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP101]], ptr addrspace(21) [[TMP191]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP192:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 96
-; POST-PROCESS-CPS-NEXT:    [[TMP193:%.*]] = inttoptr i32 [[TMP192]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP194:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP193]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP196:%.*]] = inttoptr i32 [[TMP192]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP194:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP196]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP105]], ptr addrspace(21) [[TMP194]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP195:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 100
-; POST-PROCESS-CPS-NEXT:    [[TMP196:%.*]] = inttoptr i32 [[TMP195]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP197:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP196]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP199:%.*]] = inttoptr i32 [[TMP195]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP197:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP199]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP109]], ptr addrspace(21) [[TMP197]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[TMP198:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 104
-; POST-PROCESS-CPS-NEXT:    [[TMP199:%.*]] = inttoptr i32 [[TMP198]] to ptr addrspace(21)
-; POST-PROCESS-CPS-NEXT:    [[TMP200:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP199]], i32 0
+; POST-PROCESS-CPS-NEXT:    [[TMP205:%.*]] = inttoptr i32 [[TMP198]] to ptr addrspace(21)
+; POST-PROCESS-CPS-NEXT:    [[TMP200:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP205]], i32 0
 ; POST-PROCESS-CPS-NEXT:    store i32 [[TMP113]], ptr addrspace(21) [[TMP200]], align 4
 ; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_INSERT253:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT254]], 0
 ; POST-PROCESS-CPS-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [30 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 0
@@ -5853,86 +6356,137 @@ attributes #3 = { nounwind }
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 27
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 28
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 29
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[DOTFCA_0_EXTRACT]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(22) [[TMP9]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP9:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 0
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 1
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 2
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 3
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 5
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_6_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 6
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_7_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 7
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_8_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 8
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_9_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 9
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_10_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 10
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_11_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 11
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_12_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 12
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_13_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 13
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_14_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 14
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_15_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 15
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_16_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 16
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_17_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 17
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_18_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 18
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_19_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 19
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_20_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 20
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_21_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 21
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_22_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 22
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_23_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 23
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_24_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 24
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_25_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 25
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_26_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 26
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_27_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 27
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_28_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 28
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_29_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 29
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_30_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 30
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_31_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 31
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_32_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 32
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_33_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 33
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_34_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 34
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_35_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 35
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_36_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 36
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_37_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 37
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_38_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 38
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_39_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 39
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_40_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 40
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_41_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 41
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_42_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 42
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_43_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 43
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_44_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 44
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_45_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 45
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_46_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 46
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_47_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 47
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_48_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 48
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_49_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 49
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[DOTFCA_0_EXTRACT]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(22) [[TMP10]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP11:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 4
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP11]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(22) [[TMP12]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP11]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(22) [[TMP13]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP14:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 8
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP14]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(22) [[TMP15]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP14]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP18:%.*]] = load i32, ptr addrspace(22) [[TMP16]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP17:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 12
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP17]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP19:%.*]] = load i32, ptr addrspace(22) [[TMP18]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP17]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP21:%.*]] = load i32, ptr addrspace(22) [[TMP19]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP20:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 16
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP20]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(22) [[TMP21]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP20]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP24:%.*]] = load i32, ptr addrspace(22) [[TMP22]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP23:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 20
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP23]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP25:%.*]] = load i32, ptr addrspace(22) [[TMP24]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP23]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP27:%.*]] = load i32, ptr addrspace(22) [[TMP25]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP26:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 24
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP26]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP28:%.*]] = load i32, ptr addrspace(22) [[TMP27]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP26]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP30:%.*]] = load i32, ptr addrspace(22) [[TMP28]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP29:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 28
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP29]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP31:%.*]] = load i32, ptr addrspace(22) [[TMP30]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP31:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP29]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(22) [[TMP31]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP32:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 32
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP32]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP34:%.*]] = load i32, ptr addrspace(22) [[TMP33]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP32]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP36:%.*]] = load i32, ptr addrspace(22) [[TMP34]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP35:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 36
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP35]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP37:%.*]] = load i32, ptr addrspace(22) [[TMP36]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP35]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP39:%.*]] = load i32, ptr addrspace(22) [[TMP37]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP38:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 40
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP38]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP40:%.*]] = load i32, ptr addrspace(22) [[TMP39]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP38]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP42:%.*]] = load i32, ptr addrspace(22) [[TMP40]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP41:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 44
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP41]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP43:%.*]] = load i32, ptr addrspace(22) [[TMP42]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP41]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP45:%.*]] = load i32, ptr addrspace(22) [[TMP43]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP44:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 48
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP45:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP44]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP46:%.*]] = load i32, ptr addrspace(22) [[TMP45]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP46:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP44]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP48:%.*]] = load i32, ptr addrspace(22) [[TMP46]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP47:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 52
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP47]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(22) [[TMP48]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP47]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP51:%.*]] = load i32, ptr addrspace(22) [[TMP49]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP50:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 56
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP50]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP52:%.*]] = load i32, ptr addrspace(22) [[TMP51]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP52:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP50]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP54:%.*]] = load i32, ptr addrspace(22) [[TMP52]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP53:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 60
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP53]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP55:%.*]] = load i32, ptr addrspace(22) [[TMP54]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP55:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP53]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP57:%.*]] = load i32, ptr addrspace(22) [[TMP55]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP56:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 64
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP57:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP56]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP58:%.*]] = load i32, ptr addrspace(22) [[TMP57]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP58:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP56]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP60:%.*]] = load i32, ptr addrspace(22) [[TMP58]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP59:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 68
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP59]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP61:%.*]] = load i32, ptr addrspace(22) [[TMP60]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP61:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP59]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP63:%.*]] = load i32, ptr addrspace(22) [[TMP61]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP62:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 72
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP62]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP64:%.*]] = load i32, ptr addrspace(22) [[TMP63]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP62]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP66:%.*]] = load i32, ptr addrspace(22) [[TMP64]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP65:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 76
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP66:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP65]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP67:%.*]] = load i32, ptr addrspace(22) [[TMP66]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP67:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP65]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP69:%.*]] = load i32, ptr addrspace(22) [[TMP67]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP68:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 80
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP69:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP68]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP70:%.*]] = load i32, ptr addrspace(22) [[TMP69]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP70:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP68]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP72:%.*]] = load i32, ptr addrspace(22) [[TMP70]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP71:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 84
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP72:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP71]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP73:%.*]] = load i32, ptr addrspace(22) [[TMP72]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP73:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP71]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP75:%.*]] = load i32, ptr addrspace(22) [[TMP73]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP74:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 88
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP75:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP74]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP76:%.*]] = load i32, ptr addrspace(22) [[TMP75]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP76:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP74]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP78:%.*]] = load i32, ptr addrspace(22) [[TMP76]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP77:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 92
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP78:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP77]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP79:%.*]] = load i32, ptr addrspace(22) [[TMP78]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP79:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP77]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP81:%.*]] = load i32, ptr addrspace(22) [[TMP79]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP80:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 96
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP81:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP80]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP82:%.*]] = load i32, ptr addrspace(22) [[TMP81]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP82:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP80]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP84:%.*]] = load i32, ptr addrspace(22) [[TMP82]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP83:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 100
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP84:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP83]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP85:%.*]] = load i32, ptr addrspace(22) [[TMP84]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP85:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP83]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP87:%.*]] = load i32, ptr addrspace(22) [[TMP85]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP86:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 104
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP87:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP86]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP88:%.*]] = load i32, ptr addrspace(22) [[TMP87]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP88:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP86]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP92:%.*]] = load i32, ptr addrspace(22) [[TMP88]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP89:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 0
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_EXTRACT57:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP89]], 0
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
@@ -6261,7 +6815,7 @@ attributes #3 = { nounwind }
 ;
 ;
 ; POST-PROCESS-GLOBAL-CPS-LABEL: define void @ClosestHit(
-; POST-PROCESS-GLOBAL-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [21 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META25:![0-9]+]] !lgc.cps [[META22]] !continuation [[META26:![0-9]+]] !continuation.stacksize [[META21]] {
+; POST-PROCESS-GLOBAL-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [21 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META25:![0-9]+]] !lgc.cps [[META22]] !continuation [[META26:![0-9]+]] !continuation.stacksize [[META27:![0-9]+]] {
 ; POST-PROCESS-GLOBAL-CPS-NEXT:  AllocaSpillBB:
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -6560,95 +7114,146 @@ attributes #3 = { nounwind }
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 27
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 28
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 29
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[DOTFCA_0_EXTRACT]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(22) [[TMP9]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP9:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 0
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 1
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 2
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 3
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 5
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_6_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 6
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_7_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 7
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_8_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 8
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_9_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 9
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_10_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 10
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_11_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 11
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_12_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 12
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_13_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 13
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_14_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 14
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_15_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 15
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_16_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 16
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_17_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 17
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_18_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 18
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_19_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 19
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_20_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 20
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_21_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 21
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_22_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 22
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_23_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 23
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_24_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 24
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_25_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 25
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_26_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 26
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_27_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 27
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_28_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 28
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_29_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 29
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_30_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 30
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_31_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 31
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_32_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 32
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_33_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 33
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_34_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 34
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_35_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 35
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_36_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 36
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_37_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 37
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_38_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 38
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_39_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 39
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_40_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 40
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_41_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 41
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_42_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 42
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_43_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 43
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_44_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 44
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_45_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 45
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_46_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 46
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_47_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 47
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_48_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 48
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_49_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP9]], 0, 49
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[DOTFCA_0_EXTRACT]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(22) [[TMP12]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP11:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 4
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP11]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(22) [[TMP12]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP11]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(22) [[TMP15]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP14:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 8
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP14]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(22) [[TMP15]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP14]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(22) [[TMP18]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP17:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 12
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP17]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP19:%.*]] = load i32, ptr addrspace(22) [[TMP18]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP17]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP19:%.*]] = load i32, ptr addrspace(22) [[TMP21]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP20:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 16
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP20]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(22) [[TMP21]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP20]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(22) [[TMP24]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP23:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 20
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP23]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP25:%.*]] = load i32, ptr addrspace(22) [[TMP24]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP23]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP25:%.*]] = load i32, ptr addrspace(22) [[TMP27]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP26:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 24
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP26]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP28:%.*]] = load i32, ptr addrspace(22) [[TMP27]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP26]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP28:%.*]] = load i32, ptr addrspace(22) [[TMP30]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP29:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 28
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP29]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP31:%.*]] = load i32, ptr addrspace(22) [[TMP30]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP29]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP31:%.*]] = load i32, ptr addrspace(22) [[TMP33]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP32:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 32
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP32]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP34:%.*]] = load i32, ptr addrspace(22) [[TMP33]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP32]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP34:%.*]] = load i32, ptr addrspace(22) [[TMP36]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP35:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 36
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP35]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP37:%.*]] = load i32, ptr addrspace(22) [[TMP36]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP35]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP37:%.*]] = load i32, ptr addrspace(22) [[TMP39]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP38:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 40
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP38]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP40:%.*]] = load i32, ptr addrspace(22) [[TMP39]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP38]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP40:%.*]] = load i32, ptr addrspace(22) [[TMP42]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP41:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 44
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP41]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP43:%.*]] = load i32, ptr addrspace(22) [[TMP42]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP45:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP41]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP43:%.*]] = load i32, ptr addrspace(22) [[TMP45]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP44:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 48
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP45:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP44]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP46:%.*]] = load i32, ptr addrspace(22) [[TMP45]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP44]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP46:%.*]] = load i32, ptr addrspace(22) [[TMP48]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP47:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 52
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP47]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(22) [[TMP48]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP47]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(22) [[TMP51]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP50:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 56
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP50]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP52:%.*]] = load i32, ptr addrspace(22) [[TMP51]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP50]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP52:%.*]] = load i32, ptr addrspace(22) [[TMP54]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP53:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 60
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP53]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP55:%.*]] = load i32, ptr addrspace(22) [[TMP54]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP57:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP53]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP55:%.*]] = load i32, ptr addrspace(22) [[TMP57]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP56:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 64
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP57:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP56]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP58:%.*]] = load i32, ptr addrspace(22) [[TMP57]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP56]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP58:%.*]] = load i32, ptr addrspace(22) [[TMP60]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP59:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 68
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP59]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP61:%.*]] = load i32, ptr addrspace(22) [[TMP60]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP59]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP61:%.*]] = load i32, ptr addrspace(22) [[TMP63]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP62:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 72
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP62]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP64:%.*]] = load i32, ptr addrspace(22) [[TMP63]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP66:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP62]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP64:%.*]] = load i32, ptr addrspace(22) [[TMP66]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP65:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 76
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP66:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP65]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP67:%.*]] = load i32, ptr addrspace(22) [[TMP66]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP69:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP65]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP67:%.*]] = load i32, ptr addrspace(22) [[TMP69]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP68:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 80
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP69:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP68]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP70:%.*]] = load i32, ptr addrspace(22) [[TMP69]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP72:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP68]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP70:%.*]] = load i32, ptr addrspace(22) [[TMP72]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP71:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 84
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP72:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP71]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP73:%.*]] = load i32, ptr addrspace(22) [[TMP72]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP75:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP71]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP73:%.*]] = load i32, ptr addrspace(22) [[TMP75]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP74:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 88
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP75:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP74]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP76:%.*]] = load i32, ptr addrspace(22) [[TMP75]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP78:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP74]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP76:%.*]] = load i32, ptr addrspace(22) [[TMP78]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP77:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 92
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP78:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP77]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP79:%.*]] = load i32, ptr addrspace(22) [[TMP78]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP81:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP77]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP79:%.*]] = load i32, ptr addrspace(22) [[TMP81]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP80:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 96
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP81:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP80]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP82:%.*]] = load i32, ptr addrspace(22) [[TMP81]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP84:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP80]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP82:%.*]] = load i32, ptr addrspace(22) [[TMP84]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP83:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 100
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP84:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP83]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP85:%.*]] = load i32, ptr addrspace(22) [[TMP84]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP87:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP83]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP85:%.*]] = load i32, ptr addrspace(22) [[TMP87]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP86:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 104
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP87:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP86]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP88:%.*]] = load i32, ptr addrspace(22) [[TMP87]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP91:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP86]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP88:%.*]] = load i32, ptr addrspace(22) [[TMP91]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP89:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 0
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[DOTFCA_0_EXTRACT254:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP89]], 0
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP90:%.*]] = add i32 [[TMP7]], 112
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP91:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP90]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[PAYLOAD_FCA_0_EXTRACT_RELOAD:%.*]] = load i32, ptr addrspace(22) [[TMP91]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP93:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP90]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[PAYLOAD_FCA_0_EXTRACT_RELOAD:%.*]] = load i32, ptr addrspace(22) [[TMP93]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP92:%.*]] = add i32 [[TMP7]], 108
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP93:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP92]]
-; POST-PROCESS-GLOBAL-CPS-NEXT:    [[RETURNADDR_RELOAD:%.*]] = load i32, ptr addrspace(22) [[TMP93]], align 4
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP151:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP92]]
+; POST-PROCESS-GLOBAL-CPS-NEXT:    [[RETURNADDR_RELOAD:%.*]] = load i32, ptr addrspace(22) [[TMP151]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP94:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]]
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    store i32 [[TMP10]], ptr addrspace(22) [[TMP94]], align 4
 ; POST-PROCESS-GLOBAL-CPS-NEXT:    [[TMP95:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 4
diff --git a/llvmraytracing/test/dx/remat-intrinsic.ll b/llvmraytracing/test/dx/remat-intrinsic.ll
index 5a538b179f..f0cf8e5df2 100644
--- a/llvmraytracing/test/dx/remat-intrinsic.ll
+++ b/llvmraytracing/test/dx/remat-intrinsic.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
-; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \
+; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \
 ; RUN:     -S %s --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS %s
 
 target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
@@ -169,6 +169,8 @@ attributes #1 = { nounwind }
 ; POSTPROCESS-NEXT:    [[TMP2:%.*]] = add i32 [[TMP13]], -8
 ; POSTPROCESS-NEXT:    [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 2
 ; POSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP4]], 0
+; POSTPROCESS-NEXT:    [[TMP14:%.*]] = freeze [[STRUCT_MYPARAMS:%.*]] poison
+; POSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_MYPARAMS]] [[TMP14]], 0
 ; POSTPROCESS-NEXT:    [[TMP15:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [2 x i32], [1 x i32] } [[TMP1]], 0
 ; POSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP15]], 0
 ; POSTPROCESS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 5)
diff --git a/llvmraytracing/test/dx/remove-unused-declarations.ll b/llvmraytracing/test/dx/remove-unused-declarations.ll
index f53df7b21a..03fc42b3c2 100644
--- a/llvmraytracing/test/dx/remove-unused-declarations.ll
+++ b/llvmraytracing/test/dx/remove-unused-declarations.ll
@@ -1,5 +1,5 @@
 ; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-DECL %s
-; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS-DECL %s
+; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS-DECL %s
 
 target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
 
diff --git a/llvmraytracing/test/dx/specialize-driver-shaders/analysis.ll b/llvmraytracing/test/dx/specialize-driver-shaders/analysis.ll
new file mode 100644
index 0000000000..d2cdfa6449
--- /dev/null
+++ b/llvmraytracing/test/dx/specialize-driver-shaders/analysis.ll
@@ -0,0 +1,483 @@
+; RUN: opt --verify-each -passes='specialize-driver-shaders' -S %s -debug-only='specialize-driver-shaders' 2>&1 | FileCheck %s
+;
+; REQUIRES: assertions
+
+; Intentionally align i64 to 64 bits so we can test analysis of args that contain padding in memory,
+; where the in-register layout in the calling convention does not match the memory layout.
+target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
+
+; This test focuses on the preserved-argument analysis for different shader kinds, including await handling.
+; For that, we don't care about specific argument details, and thus use an i32 array most of the time.
+%args.type = type { [31 x i32] }
+; Awaits wrap args into a struct, even if it is just a single one
+%awaited.args.type = type { %args.type }
+%args.with.padding = type { i32, i64, { i32, i64 } }
+
+; Ignored prefix args: shaderAddr, levels, state, returnAddr, shaderRecIdx
+declare void @lgc.cps.jump(...)
+; Ignored prefix args: shaderAddr, levels, shaderRecIdx
+; The __ suffix is required to let the dialect visitor detect this as an overload of the await op.
+declare %awaited.args.type @lgc.cps.await__(...)
+declare %args.with.padding @lgc.cps.await__p(...)
+declare { <2 x i16> } @lgc.cps.await__2xi16(...)
+declare { i16, i16 } @lgc.cps.await__i16i16(...)
+declare { i32 } @lgc.cps.await__i32(...)
+
+; Legacy await:
+declare %awaited.args.type @await(...)
+declare %args.type @opaque(...)
+
+; Simple AHS that just forwards args
+; CHECK-LABEL: [SDS] Analyzing function AnyHit1
+define void @AnyHit1({}, i32, i32, %args.type %args) !lgc.rt.shaderstage !2 {
+; CHECK-NEXT: [SDS] Analyzed outgoing call {{.*}} @lgc.cps.jump({{.*}} %args)
+; CHECK-NEXT: [SDS] 0         1         2         3{{$}}
+; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}}
+; CHECK-NEXT: [SDS] PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}}
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, %args.type %args)
+  unreachable
+; CHECK-NEXT: [SDS] Finished analysis of function AnyHit1
+; CHECK-NEXT: [SDS] 0         1         2         3{{$}}
+; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}}
+; CHECK-NEXT: [SDS] PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}}
+}
+
+; Single-jump AHS that:
+;  * swaps the first two dwords (dynamic)
+;  * writes constant to dword 10 (constant)
+;  * writes poison to dword 11 (undef)
+;  * conditionally writes constant to dword 20 (constant)
+;  * conditionally writes undef to dword 21 (undef)
+;  * conditionally writes undef or constant to dword 22 (constant)
+;  * conditionally writes constant or dynamic to dword 23 (dynamic)
+;  * writes same constants to dword 25 (constant)
+;  * writes different constants to dword 26 (dynamic)
+; CHECK-LABEL: [SDS] Analyzing function AnyHit2
+define void @AnyHit2({}, i32, i32, %args.type %args) !lgc.rt.shaderstage !2 {
+entry:
+  %dw0 = extractvalue %args.type %args, 0, 0
+  %dw1 = extractvalue %args.type %args, 0, 1
+  %tmp0 = insertvalue %args.type %args, i32 %dw1, 0, 0
+  %tmp1 = insertvalue %args.type %tmp0, i32 %dw0, 0, 1
+  %tmp2 = insertvalue %args.type %tmp1, i32 -1, 0, 10
+  %tmp3 = insertvalue %args.type %tmp2, i32 poison, 0, 11
+  %tmp4 = insertvalue %args.type %tmp3, i32 undef, 0, 22
+  %dynamic = load i32, ptr null
+  %tmp5 = insertvalue %args.type %tmp4, i32 %dynamic, 0, 23
+  %tmp6 = insertvalue %args.type %tmp5, i32 -1, 0, 25
+  %tmp7 = insertvalue %args.type %tmp6, i32 0, 0, 26
+  %cond = trunc i32 %dw0 to i1
+  br i1 %cond, label %conditional, label %exit
+conditional:
+  %tmp8 = insertvalue %args.type %tmp7, i32 0, 0, 20
+  %tmp9 = insertvalue %args.type %tmp8, i32 undef, 0, 21
+  %tmp10 = insertvalue %args.type %tmp9, i32 -1, 0, 22
+  %tmp11 = insertvalue %args.type %tmp10, i32 -1, 0, 23
+  %tmp12 = insertvalue %args.type %tmp11, i32 -1, 0, 25
+  %tmp13 = insertvalue %args.type %tmp12, i32 -1, 0, 26
+  br label %exit
+exit:
+  %args.final = phi %args.type [ %tmp13, %conditional ], [ %tmp7, %entry ]
+; CHECK-NEXT: [SDS] Analyzed outgoing call {{.*}} @lgc.cps.jump({{.*}} %args.final)
+; CHECK-NEXT: [SDS] 0         1         2         3{{$}}
+; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}}
+; CHECK-NEXT: [SDS] DDPPPPPPPPCUPPPPPPPPCUCDPCDPPPP{{$}}
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, %args.type %args.final)
+  unreachable
+}
+
+; Two-jump AHS that does different things on the two jumps, testing merging of states
+; across jumps works correctly:
+;  * write constant to dword 0 only on Jump0
+;  * write constant to dword 1 only on Jump1
+;  * write matching constants to dword 2
+;  * write non-matching constants to dword 3
+; CHECK-LABEL: [SDS] Analyzing function AnyHit3
+define void @AnyHit3({}, i32, i32, %args.type %args) !lgc.rt.shaderstage !2 {
+entry:
+  %dw0 = extractvalue %args.type %args, 0, 0
+  %cond = trunc i32 %dw0 to i1
+  br i1 %cond, label %exit0, label %exit1
+exit0:
+  %tmp0 = insertvalue %args.type %args, i32 -1, 0, 0
+  %tmp1 = insertvalue %args.type %tmp0, i32 -1, 0, 2
+  %tmp2 = insertvalue %args.type %tmp1, i32 -1, 0, 3
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, %args.type %tmp2)
+  unreachable
+exit1:
+  %tmp3 = insertvalue %args.type %args, i32 -1, 0, 1
+  %tmp4 = insertvalue %args.type %tmp3, i32 -1, 0, 2
+  %tmp5 = insertvalue %args.type %tmp4, i32 -2, 0, 3
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, %args.type %tmp5)
+  unreachable
+; CHECK:      [SDS] Finished analysis of function AnyHit3
+; CHECK-NEXT: [SDS] 0         1         2         3{{$}}
+; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}}
+; CHECK-NEXT: [SDS] CCCDPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}}
+}
+
+; Intersection with an await call simulating a ReportHit call.
+; Check that values passed to await are checked and accounted for in the preserved state,
+; and that using values returned from await counts as preserved.
+; Also check that using original argument values in awaits after awaits still counts as preserved.
+; Note: This is only possible because we run before coro passes; after coro passes such values
+; would be loaded from continuation state and their origin would be unknown.
+; This uses lgc.cps.await.
+; CHECK-LABEL: [SDS] Analyzing function Intersection1
+define void @Intersection1({}, i32, i32, %args.type %args) !lgc.rt.shaderstage !1 {
+entry:
+  %dw0 = extractvalue %args.type %args, 0, 0
+  %cond = trunc i32 %dw0 to i1 ; low bit of incoming dword 0 decides whether the awaits run
+  br i1 %cond, label %conditional, label %exit
+conditional:
+; Pass through args, trivially all-preserve
+; CHECK-NEXT: [SDS] Analyzed outgoing call   %awaited.0.struct {{.*}}lgc.cps.await{{.*}} %args)
+; CHECK-NEXT: [SDS] 0         1         2         3{{$}}
+; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}}
+; CHECK-NEXT: [SDS] PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}}
+  %awaited.0.struct = call %awaited.args.type (...) @lgc.cps.await__(i32 poison, i32 poison, i32 poison, %args.type %args)
+  %awaited.0 = extractvalue %awaited.args.type %awaited.0.struct, 0
+; Pass awaited results. Should still be all-preserve. This tests awaited results are correctly handled.
+; CHECK-NEXT: [SDS] Analyzed outgoing call   %awaited.1.struct {{.*}}lgc.cps.await{{.*}} %awaited.0)
+; CHECK-NEXT: [SDS] 0         1         2         3{{$}}
+; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}}
+; CHECK-NEXT: [SDS] PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}}
+  %awaited.1.struct = call %awaited.args.type (...) @lgc.cps.await__(i32 poison, i32 poison, i32 poison, %args.type %awaited.0)
+  %awaited.1 = extractvalue %awaited.args.type %awaited.1.struct, 0
+  %awaited.merged = insertvalue %args.type %awaited.1, i32 %dw0, 0, 0 ; put the original incoming dword 0 back into slot 0
+; Reuse incoming dword 0. Should still be preserved.
+; CHECK-NEXT: [SDS] Analyzed outgoing call   %awaited.2.struct {{.*}}lgc.cps.await{{.*}} %awaited.merged)
+; CHECK-NEXT: [SDS] 0         1         2         3{{$}}
+; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}}
+; CHECK-NEXT: [SDS] PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}}
+  %awaited.2.struct = call %awaited.args.type (...) @lgc.cps.await__(i32 poison, i32 poison, i32 poison, %args.type %awaited.merged)
+  %awaited.2 = extractvalue %awaited.args.type %awaited.2.struct, 0
+  br label %exit
+exit:
+  %args.final = phi %args.type [ %awaited.2, %conditional ], [ %args, %entry ] ; last await result, or the untouched incoming args
+; CHECK-NEXT: [SDS] Analyzed outgoing call {{.*}} @lgc.cps.jump({{.*}} %args.final)
+; CHECK-NEXT: [SDS] 0         1         2         3{{$}}
+; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}}
+; CHECK-NEXT: [SDS] PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}}
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, %args.type %args.final)
+  unreachable
+}
+
+; Basic test that legacy await is also handled.
+; Note: This test is a bit odd, because this is an lgc.cps module, and we only expect legacy awaits in non-lgc.cps modules.
+; Thus, we use the lgc.cps mode version of lgc.cps.jump including a to-be-ignored shader record index.
+; CHECK-LABEL: [SDS] Analyzing function Intersection2
+define void @Intersection2({}, i32, i32, %args.type %args) !lgc.rt.shaderstage !1 {
+  %handle = call ptr inttoptr (i32 poison to ptr)(%args.type %args) ; indirect call through a poison address, passing %args unchanged
+  %awaited = call %args.type @await(ptr %handle) ; legacy await on the returned handle
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, %args.type %awaited)
+  ret void
+; CHECK:      [SDS] Finished analysis of function Intersection2
+; CHECK-NEXT: [SDS] 0         1         2         3{{$}}
+; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}}
+; CHECK-NEXT: [SDS] PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}}
+}
+
+; Check that other function calls to non-await functions are not accidentally considered as preserved.
+; CHECK-LABEL: [SDS] Analyzing function Intersection3
+define void @Intersection3({}, i32, i32, %args.type %args) !lgc.rt.shaderstage !1 {
+  %not.awaited = call %args.type @opaque(%args.type %args) ; ordinary call, not an await: result must be treated as dynamic
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, %args.type %not.awaited)
+  ret void
+; CHECK:      [SDS] Finished analysis of function Intersection3
+; CHECK-NEXT: [SDS] 0         1         2         3{{$}}
+; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}}
+; CHECK-NEXT: [SDS] DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD{{$}}
+}
+
+; Check that with awaits and phi nodes, we apply the value origin analysis to incoming values of phi nodes,
+; and not directly compare against incoming args and await results.
+; Check both the dynamic and constant case: Change dw0 dynamically, and dw1 to a constant.
+; Then conditionally await, and at the end jump using either the modified %args value or the await result.
+; The jump argument will be a phi result, and the incoming value
+; needs to go through value origin tracking to determine it's in fact
+; the incoming function argument, except for the modified dword.
+; We use two conditional awaits so also in the constant case (dw1), there are multiple
+; dynamic values coming into the phi node. With just a single one, value origin tracking
+; can see through the phi node and our phi node handling is not triggered.
+; CHECK-LABEL: [SDS] Analyzing function Intersection4
+define void @Intersection4({}, i32, i32, %args.type %args) !lgc.rt.shaderstage !1 {
+entry:
+  %dw1 = extractvalue %args.type %args, 0, 1
+; Dynamic case: dword 0 is overwritten with the incoming dword 1.
+  %args.modified.0 = insertvalue %args.type %args, i32 %dw1, 0, 0
+; Constant case: dword 1 is overwritten with the constant 0.
+  %args.modified = insertvalue %args.type %args.modified.0, i32 0, 0, 1
+  switch i32 %dw1, label %exit [
+    i32 0, label %conditional.0
+    i32 1, label %conditional.1
+  ]
+conditional.0:
+; CHECK-NEXT: [SDS] Analyzed outgoing call   %awaited.0.struct {{.*}}lgc.cps.await{{.*}} %args.modified)
+; CHECK-NEXT: [SDS] 0         1         2         3{{$}}
+; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}}
+; CHECK-NEXT: [SDS] DCPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}}
+  %awaited.0.struct = call %awaited.args.type (...) @lgc.cps.await__(i32 poison, i32 poison, i32 poison, %args.type %args.modified)
+  %awaited.0 = extractvalue %awaited.args.type %awaited.0.struct, 0
+  br label %exit
+conditional.1:
+; CHECK-NEXT: [SDS] Analyzed outgoing call   %awaited.1.struct {{.*}}lgc.cps.await{{.*}} %args.modified)
+; CHECK-NEXT: [SDS] 0         1         2         3{{$}}
+; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}}
+; CHECK-NEXT: [SDS] DCPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}}
+  %awaited.1.struct = call %awaited.args.type (...) @lgc.cps.await__(i32 poison, i32 poison, i32 poison, %args.type %args.modified)
+  %awaited.1 = extractvalue %awaited.args.type %awaited.1.struct, 0
+  br label %exit
+exit:
+  %args.final = phi %args.type [ %awaited.0, %conditional.0 ], [ %awaited.1, %conditional.1 ], [ %args.modified, %entry ]
+; CHECK:      [SDS] Analyzed outgoing call {{.*}} @lgc.cps.jump({{.*}} %args.final)
+; CHECK-NEXT: [SDS] 0         1         2         3{{$}}
+; CHECK-NEXT: [SDS] 0123456789012345678901234567890{{$}}
+; CHECK-NEXT: [SDS] DCPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}}
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, %args.type %args.final)
+  unreachable
+}
+
+declare [4 x i32] @opaqueCandidate()
+
+; Traversal shader that contains jumps to an AHS setting a dynamic candidate, and a return back to raygen that preserves only parts of the args.
+; CHECK-LABEL: [SDS] Analyzing function Traversal1 (shader stage compute)
+define void @Traversal1({}, i32 %ret.addr, i32, { [2 x i32], [8 x i32] } %system.data, [4 x i32] %padding, [8 x i32] %payload) !lgc.rt.shaderstage !6 {
+  %cond = trunc i32 %ret.addr to i1 ; low bit of %ret.addr selects the RGS-resume or the AHS path
+  br i1 %cond, label %rgs.resume, label %ahs
+ahs: ; jump passing the incoming system data, an opaque candidate and the payload
+  %ahs.system.data.0 = insertvalue { { [2 x i32], [8 x i32] }, [4 x i32] } poison, { [2 x i32], [8 x i32] } %system.data, 0
+  %candidate = call [4 x i32] @opaqueCandidate() ; value of unknown origin, passed in place of the incoming %padding
+  %ahs.system.data = insertvalue { { [2 x i32], [8 x i32] }, [4 x i32] } %ahs.system.data.0, [4 x i32] %candidate, 1
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, { { [2 x i32], [8 x i32] }, [4 x i32] } %ahs.system.data, [8 x i32] %payload)
+  unreachable
+rgs.resume: ; jump passing only the leading [2 x i32] of the system data, 12 poison dwords and the payload
+  %dispatch.system.data = extractvalue { [2 x i32], [8 x i32] } %system.data, 0
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, [2 x i32] %dispatch.system.data, [12 x i32] poison, [8 x i32] %payload)
+  unreachable
+; CHECK-LABEL: [SDS] Finished analysis of function Traversal1
+; CHECK-NEXT:  [SDS] 0         1         2
+; CHECK-NEXT:  [SDS] 0123456789012345678901
+; CHECK-NEXT:  [SDS] PPUUUUUUUUDDDDPPPPPPPP
+}
+
+; Same as above, but without padding args.
+; Hypothetical traversal calling an AHS with a larger arg size, and a RGS with smaller arg size.
+; This tests mismatching incoming vs outgoing arg sizes.
+; CHECK-LABEL: [SDS] Analyzing function Traversal2 (shader stage compute)
+define void @Traversal2({}, i32 %ret.addr, i32, { [2 x i32], [8 x i32] } %system.data, [8 x i32] %payload) !lgc.rt.shaderstage !6 {
+  %cond = trunc i32 %ret.addr to i1 ; low bit of %ret.addr selects the RGS-resume or the AHS path
+  br i1 %cond, label %rgs.resume, label %ahs
+ahs: ; outgoing args are 4 dwords larger than the incoming ones, so the payload no longer lines up
+  %ahs.system.data.0 = insertvalue { { [2 x i32], [8 x i32] }, [4 x i32] } poison, { [2 x i32], [8 x i32] } %system.data, 0
+  %candidate = call [4 x i32] @opaqueCandidate()
+  %ahs.system.data = insertvalue { { [2 x i32], [8 x i32] }, [4 x i32] } %ahs.system.data.0, [4 x i32] %candidate, 1
+; CHECK-NEXT: [SDS] Analyzed outgoing call   call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, { { [2 x i32], [8 x i32] }, [4 x i32] } %ahs.system.data, [8 x i32] %payload)
+; CHECK-NEXT: [SDS] 0         1         2
+; CHECK-NEXT: [SDS] 0123456789012345678901
+; CHECK-NEXT: [SDS] PPPPPPPPPPDDDDDDDDDDDD
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, { { [2 x i32], [8 x i32] }, [4 x i32] } %ahs.system.data, [8 x i32] %payload)
+  unreachable
+rgs.resume: ; outgoing args are 8 dwords smaller than the incoming ones
+  %dispatch.system.data = extractvalue { [2 x i32], [8 x i32] } %system.data, 0
+; CHECK-NEXT: [SDS] Analyzed outgoing call   call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, [2 x i32] %dispatch.system.data, [8 x i32] %payload)
+; CHECK-NEXT: [SDS] 0
+; CHECK-NEXT: [SDS] 0123456789
+; CHECK-NEXT: [SDS] PPDDDDDDDD
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, [2 x i32] %dispatch.system.data, [8 x i32] %payload)
+  unreachable
+; CHECK-NEXT: [SDS] Finished analysis of function Traversal2
+; CHECK-NEXT: [SDS] 0         1         2
+; CHECK-NEXT: [SDS] 0123456789012345678901
+; CHECK-NEXT: [SDS] PPDDDDDDDDDDDDDDDDDDDD
+}
+
+; %args.with.padding requires 6 registers as argument, but 8 dwords in memory
+; Test that we correctly map the argument slots into the in-memory type layout,
+; by extracting the individual dword values, and passing them as scalars to an outgoing jump.
+; This should be detected as preserve.
+; CHECK-LABEL: [SDS] Analyzing function JumpWithPaddingInType
+define void @JumpWithPaddingInType({}, i32 %ret.addr, i32, %args.with.padding %args) !lgc.rt.shaderstage !2 {
+  %scalar.0 = extractvalue %args.with.padding %args, 0
+  %scalar.1 = extractvalue %args.with.padding %args, 1
+  %scalar.2 = extractvalue %args.with.padding %args, 2, 0 ; first member of the nested aggregate
+  %scalar.3 = extractvalue %args.with.padding %args, 2, 1 ; second member of the nested aggregate
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, i32 %scalar.0, i64 %scalar.1, i32 %scalar.2, i64 %scalar.3)
+  unreachable
+; CHECK-LABEL: [SDS] Finished analysis of function JumpWithPaddingInType
+; CHECK-NEXT:  [SDS] 0
+; CHECK-NEXT:  [SDS] 012345
+; CHECK-NEXT:  [SDS] PPPPPP
+}
+
+; Same as above, but for await results.
+; CHECK-LABEL: [SDS] Analyzing function AwaitWithPaddingInType
+define void @AwaitWithPaddingInType({}, i32 %ret.addr, i32, %args.with.padding %args) !lgc.rt.shaderstage !1 {
+  ; Intentionally do not wrap %args in a struct -- instead pretend the await function returns
+  ; the elements of %args as separate args, so we can test the mapping of arg slots into the returned struct
+  ; with multiple struct elements.
+  %awaited = call %args.with.padding (...) @lgc.cps.await__p(i32 poison, i32 poison, i32 poison, %args.with.padding %args)
+  %scalar.0 = extractvalue %args.with.padding %awaited, 0
+  %scalar.1 = extractvalue %args.with.padding %awaited, 1
+  %scalar.2 = extractvalue %args.with.padding %awaited, 2, 0 ; first member of the nested aggregate
+  %scalar.3 = extractvalue %args.with.padding %awaited, 2, 1 ; second member of the nested aggregate
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, i32 %scalar.0, i64 %scalar.1, i32 %scalar.2, i64 %scalar.3)
+  unreachable
+; CHECK-LABEL: [SDS] Finished analysis of function AwaitWithPaddingInType
+; CHECK-NEXT:  [SDS] 0
+; CHECK-NEXT:  [SDS] 012345
+; CHECK-NEXT:  [SDS] PPPPPP
+}
+
+; Check that we don't treat a single passed-through i16 as preserve. The high outgoing bits are poison,
+; so in theory we could treat this as preserve, because only non-poison bits are relevant for the analysis,
+; but currently we handle i16s conservatively. Properly supporting i16s is complicated, because incoming poison
+; bits that might even be implicit in the in-memory representation of a type need to be accounted for.
+; For instance, consider the example that forwards an incoming <2 x i16> argument to a bitcast outgoing i32 argument
+; in the JumpWithOverlappingi16s test case.
+; CHECK-LABEL: [SDS] Analyzing function JumpWithSinglei16
+define void @JumpWithSinglei16({}, i32 %ret.addr, i32, i16 %arg) !lgc.rt.shaderstage !2 {
+; Forward arg as-is.
+; CHECK-NEXT: [SDS] Analyzed outgoing call   call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, i16 %arg)
+; CHECK-NEXT: [SDS] 0
+; CHECK-NEXT: [SDS] 0
+; CHECK-NEXT: [SDS] D
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, i16 %arg) ; the i16 covers only part of its single arg slot
+  unreachable
+}
+
+; Check that we don't treat a misaligned passed-through dword as preserve. Use a packed struct to force misalignment.
+; CHECK-LABEL: [SDS] Analyzing function JumpWithMisalignedDword
+define void @JumpWithMisalignedDword({}, i32 %ret.addr, i32, <{ i16, i32 }> %args) !lgc.rt.shaderstage !2 {
+; In the packed struct the i32 directly follows the i16 (byte offset 2), so it is not dword-aligned.
+  switch i32 %ret.addr, label %conditional.0 [
+    i32 0, label %conditional.0
+    i32 1, label %conditional.1
+  ]
+conditional.0:
+; Forward args as-is.
+; CHECK-NEXT: [SDS] Analyzed outgoing call   call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, <{ i16, i32 }> %args)
+; CHECK-NEXT: [SDS] 0
+; CHECK-NEXT: [SDS] 01
+; CHECK-NEXT: [SDS] DD
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, <{ i16, i32 }> %args)
+  unreachable
+conditional.1:
+; Forward extracted scalars.
+  %scalar.0 = extractvalue <{ i16, i32 }> %args, 0
+  %scalar.1 = extractvalue <{ i16, i32 }> %args, 1
+; CHECK-NEXT: [SDS] Analyzed outgoing call   call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, i16 %scalar.0, i32 %scalar.1)
+; CHECK-NEXT: [SDS] 0
+; CHECK-NEXT: [SDS] 01
+; CHECK-NEXT: [SDS] DD
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, i16 %scalar.0, i32 %scalar.1)
+  unreachable
+}
+
+; All cases involving i16 scalars should not be treated as preserve, as the i16 cannot guarantee to preserve high bits.
+; Additionally, there can be issues with alignment.
+; CHECK-LABEL: [SDS] Analyzing function JumpWithOverlappingi16s
+define void @JumpWithOverlappingi16s({}, i32 %ret.addr, i32, <2 x i16> %args) !lgc.rt.shaderstage !2 {
+  switch i32 %ret.addr, label %conditional.2 [
+    i32 0, label %conditional.0
+    i32 1, label %conditional.1
+    i32 2, label %conditional.2
+  ]
+conditional.0: ; forward the <2 x i16> argument as-is
+; CHECK-NEXT: [SDS] Analyzed outgoing call   call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, <2 x i16> %args)
+; CHECK-NEXT: [SDS] 0
+; CHECK-NEXT: [SDS] 01
+; CHECK-NEXT: [SDS] DD
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, <2 x i16> %args)
+  unreachable
+conditional.1:
+; Forward extracted scalars. This preserves arg slots, but we can't detect it.
+  %scalar.0 = extractelement <2 x i16> %args, i32 0
+  %scalar.1 = extractelement <2 x i16> %args, i32 1
+; CHECK-NEXT: [SDS] Analyzed outgoing call   call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, i16 %scalar.0, i16 %scalar.1)
+; CHECK-NEXT: [SDS] 0
+; CHECK-NEXT: [SDS] 01
+; CHECK-NEXT: [SDS] DD
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, i16 %scalar.0, i16 %scalar.1)
+  unreachable
+conditional.2:
+; Forward just the bitcast. This does *not* preserve arg slots, as we merge both i16s into a single i32 arg slot.
+; Even when relaxing i16 handling and allowing to treat forwarded i16 arguments as preserve, exploiting that the high bits
+; are poison, we may not treat this as preserve. A naive implementation that just compares the value origin of the
+; outgoing %bitcast argument with the corresponding incoming argument slot (value %args, offset 0) might come to the conclusion that it is
+; preserved. But when allowing i16s, we need to additionally account for the incoming high poison bits that are implicit
+; in the in-memory representation of %args.
+; CHECK-NEXT: [SDS] Analyzed outgoing call   call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, i32 %bitcast)
+; CHECK-NEXT: [SDS] 0
+; CHECK-NEXT: [SDS] 0
+; CHECK-NEXT: [SDS] D
+  %bitcast = bitcast <2 x i16> %args to i32 ; both i16 elements end up in one i32 value
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, i32 %bitcast)
+  unreachable
+}
+
+; Same as above, but with awaits.
+; CHECK-LABEL: [SDS] Analyzing function AwaitWithOverlappingi16s
+define void @AwaitWithOverlappingi16s({}, i32 %ret.addr, i32, <2 x i16> %args) !lgc.rt.shaderstage !2 {
+  switch i32 %ret.addr, label %conditional.2 [
+    i32 0, label %conditional.0
+    i32 1, label %conditional.1
+    i32 2, label %conditional.2
+  ]
+conditional.0:
+; Forward args as-is through an await.
+  %awaited.0.struct = call { <2 x i16> } (...) @lgc.cps.await__2xi16(i32 poison, i32 poison, i32 poison, <2 x i16> %args)
+  %awaited.0 = extractvalue { <2 x i16> } %awaited.0.struct, 0
+; CHECK-LABEL: [SDS] Analyzed outgoing call   call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, <2 x i16> %awaited.0)
+; CHECK-NEXT: [SDS] 0
+; CHECK-NEXT: [SDS] 01
+; CHECK-NEXT: [SDS] DD
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, <2 x i16> %awaited.0)
+  unreachable
+conditional.1:
+; Forward extracted scalars through an await.
+  %scalar.0 = extractelement <2 x i16> %args, i32 0
+  %scalar.1 = extractelement <2 x i16> %args, i32 1
+  %awaited.1.struct = call { i16, i16 } (...) @lgc.cps.await__i16i16(i32 poison, i32 poison, i32 poison, i16 %scalar.0, i16 %scalar.1)
+  %awaited.1.0 = extractvalue { i16, i16 } %awaited.1.struct, 0
+  %awaited.1.1 = extractvalue { i16, i16 } %awaited.1.struct, 1
+; CHECK:      [SDS] Analyzed outgoing call   call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, i16 %awaited.1.0, i16 %awaited.1.1)
+; CHECK-NEXT: [SDS] 0
+; CHECK-NEXT: [SDS] 01
+; CHECK-NEXT: [SDS] DD
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, i16 %awaited.1.0, i16 %awaited.1.1)
+  unreachable
+conditional.2:
+; Forward just the bitcast. This does *not* preserve arg slots, as we merge both i16s into a single arg slot.
+; CHECK-NEXT: [SDS] Analyzed outgoing call   call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, i32 %bitcast)
+; CHECK-NEXT: [SDS] 0
+; CHECK-NEXT: [SDS] 0
+; CHECK-NEXT: [SDS] D
+  %bitcast = bitcast <2 x i16> %args to i32 ; NOTE(review): no await on this path; it repeats the JumpWithOverlappingi16s bitcast case - confirm intended
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, i32 %bitcast)
+  unreachable
+}
+
+; Check that we ignore callable shaders
+define void @Callable({}, i32 %ret.addr, i32, %args.type %args) !lgc.rt.shaderstage !5 {
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, %args.type %args) ; forwards the incoming args unchanged
+  unreachable
+; CHECK-NOT: [SDS] Finished analysis of function Callable
+}
+
+; Check that we ignore launch kernel shaders
+define void @LaunchKernel({}, i32 %ret.addr, i32, %args.type %args) !lgc.rt.shaderstage !7 {
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 poison, %args.type %args) ; forwards the incoming args unchanged
+  unreachable
+; CHECK-NOT: [SDS] Finished analysis of function LaunchKernel
+}
+
+; CHECK: [SDS] Serialized state to MD:
+!lgc.cps.module = !{}
+!lgc.rt.specialize.driver.shaders.process.in.instruction.order = !{}
+
+!1 = !{i32 1} ; Intersection
+!2 = !{i32 2} ; AHS
+!5 = !{i32 5} ; Callable
+!6 = !{i32 6} ; Traversal
+!7 = !{i32 7} ; KernelEntry
diff --git a/llvmraytracing/test/dx/specialize-driver-shaders/lower-rt-pipeline-args.ll b/llvmraytracing/test/dx/specialize-driver-shaders/lower-rt-pipeline-args.ll
new file mode 100644
index 0000000000..576a600141
--- /dev/null
+++ b/llvmraytracing/test/dx/specialize-driver-shaders/lower-rt-pipeline-args.ll
@@ -0,0 +1,467 @@
+; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,sroa,specialize-driver-shaders,lint,remove-types-metadata" -S --lint-abort-on-error -debug-only='specialize-driver-shaders' %s 2>&1 | FileCheck %s
+;
+; Test that argument layouts (number of ignored arguments) expected in specialize-driver-shaders match what lower-raytracing-pipeline does.
+; Intentionally only test non-lgc.cps-mode, as lgc.cps mode requires different arguments in test IR,
+; and as it is already tested as part of an LLPC offline pipeline compilation test.
+;
+; REQUIRES: assertions
+
+target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
+
+%dx.types.Handle = type { i8* }
+%struct.DispatchSystemData = type { <3 x i32> }
+%struct.TraversalData = type { %struct.SystemData, %struct.HitData, <3 x float>, <3 x float>, float, i64 }
+%struct.SystemData = type { %struct.DispatchSystemData }
+%struct.HitData = type { <3 x float>, <3 x float>, float, i32 }
+%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData }
+%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> }
+%struct.RayPayload = type { <4 x float> }
+%dx.types.ResourceProperties = type { i32, i32 }
+%struct.BuiltInTriangleIntersectionAttributes2 = type { <2 x float> }
+%struct.RaytracingAccelerationStructure = type { i32 }
+%"class.RWTexture2D<vector<float, 4> >" = type { <4 x float> }
+
+@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4
+@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4
+
+define i32 @_cont_GetContinuationStackAddr() #0 { ; test stub: always returns 0
+  ret i32 0
+}
+
+define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !{%struct.DispatchSystemData poison} { ; test stub: does nothing with %data
+  ret void
+}
+
+declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #0
+
+declare %struct.DispatchSystemData @_AmdAwaitShader(i64, i64, %struct.DispatchSystemData) #0
+
+declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, i64, %struct.AnyHitTraversalData, float, i32) #0
+
+define %struct.HitData @_cont_GetCandidateState(%struct.AnyHitTraversalData* %data) #0 !pointeetys !32 { ; loads a %struct.HitData from the start of %data
+  %resPtr = getelementptr %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, i32 0, i32 0 ; NOTE(review): field 0 is the %struct.TraversalData; the %struct.HitData member is field 1 - confirm intended for this test
+  %res = load %struct.HitData, %struct.HitData* %resPtr, align 4
+  ret %struct.HitData %res
+}
+
+declare !pointeetys !34 %struct.HitData @_cont_GetCommittedState(%struct.SystemData*) #0
+
+declare !pointeetys !36 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #0
+
+define void @_cont_SetTriangleHitAttributes(%struct.SystemData* %data, %struct.BuiltInTriangleIntersectionAttributes %val) !pointeetys !37 { ; stores %val at the start of %data
+  %addr = getelementptr %struct.SystemData, %struct.SystemData* %data, i32 0, i32 0 ; address of field 0 (the %struct.DispatchSystemData member)
+  store %struct.BuiltInTriangleIntersectionAttributes %val, %struct.BuiltInTriangleIntersectionAttributes* %addr, align 4
+  ret void
+}
+
+define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) !pointeetys !38 { ; test stub: always returns 5, %data is not read
+  ret i32 5
+}
+
+declare i1 @opaqueIsEnd()
+
+define i1 @_cont_IsEndSearch(%struct.TraversalData*) #0 !pointeetys !40 { ; ignores its argument and defers to @opaqueIsEnd
+  %isEnd = call i1 @opaqueIsEnd() ; external declaration, no body in this file
+  ret i1 %isEnd
+}
+
+declare !pointeetys !42 i32 @_cont_HitKind(%struct.SystemData*) #0
+
+; Function Attrs: nounwind
+declare i64 @_AmdGetResumePointAddr() #1
+
+; Function Attrs: nounwind
+declare !pointeetys !43 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #1
+
+; Function Attrs: nounwind
+declare !pointeetys !44 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #1
+
+; Function Attrs: nounwind
+declare !pointeetys !43 void @_cont_AcceptHitAndEndSearch(%struct.DispatchSystemData* nocapture readnone) #1
+
+; Function Attrs: nounwind
+declare !pointeetys !44 void @_cont_AcceptHit(%struct.AnyHitTraversalData* nocapture readnone) #1
+
+; Function Attrs: nounwind
+declare !pointeetys !43 void @_cont_IgnoreHit(%struct.DispatchSystemData* nocapture readnone) #1
+
+; Function Attrs: nounwind
+declare !pointeetys !44 void @_AmdAcceptHitAttributes(%struct.AnyHitTraversalData* nocapture readnone) #1
+
+define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #0 !pointeetys !45 {
+  %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 ; current dispatch system data
+  %sys_data = insertvalue %struct.SystemData zeroinitializer, %struct.DispatchSystemData %dis_data, 0 ; wrap it in an otherwise zero-initialized SystemData
+  %trav_data = insertvalue %struct.TraversalData zeroinitializer, %struct.SystemData %sys_data, 0 ; ... and that in an otherwise zero-initialized TraversalData
+  %addr = call i64 @_AmdGetResumePointAddr() #3
+  %trav_data2 = insertvalue %struct.TraversalData %trav_data, i64 %addr, 5 ; field 5 (the i64) receives the resume point address
+  %newdata = call %struct.DispatchSystemData @_AmdAwaitTraversal(i64 4, %struct.TraversalData %trav_data2) ; none of the unnamed ray parameters (%0..%13) are used
+  store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 ; write the awaited result back through %data
+  call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data)
+  ret void
+}
+
+define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !pointeetys !46 { ; the unnamed i32 parameter is not used
+  %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4
+  %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, i64 poison, %struct.DispatchSystemData %dis_data) ; await passing the current dispatch system data
+  store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 ; write the awaited result back through %data
+  call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data)
+  ret void
+}
+
+define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) #0 !pointeetys !47 { ; returns true after awaiting any-hit, false otherwise
+  %origTPtr = getelementptr inbounds %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, i32 0, i32 0, i32 4 ; the float at field 4 of the embedded TraversalData
+  %origT = load float, float* %origTPtr, align 4
+  %isNoHit = fcmp fast uge float %t, %origT ; %t not below the stored value => skip the any-hit await
+  br i1 %isNoHit, label %isEnd, label %callAHit
+
+callAHit:                                         ; preds = %0
+  %trav_data = load %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, align 4
+  %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64 3, i64 poison, %struct.AnyHitTraversalData %trav_data, float %t, i32 %hitKind)
+  store %struct.AnyHitTraversalData %newdata, %struct.AnyHitTraversalData* %data, align 4 ; write the awaited result back through %data
+  call void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData* %data)
+  ret i1 true
+
+isEnd:                                            ; preds = %0
+  ; Call AcceptHitAttributes, just to simulate it
+  call void @_AmdAcceptHitAttributes(%struct.AnyHitTraversalData* %data)
+  ret i1 false
+}
+
+define <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData* %data) !pointeetys !48 { ; returns the <3 x i32> stored in field 0 of %data
+  %resPtr.1 = getelementptr %struct.DispatchSystemData, %struct.DispatchSystemData* %data, i32 0, i32 0, i32 0 ; element 0
+  %res.1 = load i32, i32* %resPtr.1, align 4
+  %resPtr.2 = getelementptr %struct.DispatchSystemData, %struct.DispatchSystemData* %data, i32 0, i32 0, i32 1 ; element 1
+  %res.2 = load i32, i32* %resPtr.2, align 4
+  %resPtr.3 = getelementptr %struct.DispatchSystemData, %struct.DispatchSystemData* %data, i32 0, i32 0, i32 2 ; element 2
+  %res.3 = load i32, i32* %resPtr.3, align 4
+  %val.0 = insertelement <3 x i32> undef, i32 %res.1, i32 0 ; reassemble the vector from the three scalar loads
+  %val.1 = insertelement <3 x i32> %val.0, i32 %res.2, i32 1
+  %val.2 = insertelement <3 x i32> %val.1, i32 %res.3, i32 2
+  ret <3 x i32> %val.2
+}
+
+define <3 x float> @_cont_ObjectRayOrigin3(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) !pointeetys !49 { ; returns field 0 of %hitData
+  %resPtr.1 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 0, i32 0 ; element 0 of the first <3 x float>
+  %res.1 = load float, float* %resPtr.1, align 4
+  %resPtr.2 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 0, i32 1 ; element 1
+  %res.2 = load float, float* %resPtr.2, align 4
+  %resPtr.3 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 0, i32 2 ; element 2
+  %res.3 = load float, float* %resPtr.3, align 4
+  %val.0 = insertelement <3 x float> undef, float %res.1, i32 0 ; reassemble the vector from the three scalar loads
+  %val.1 = insertelement <3 x float> %val.0, float %res.2, i32 1
+  %val.2 = insertelement <3 x float> %val.1, float %res.3, i32 2
+  ret <3 x float> %val.2
+}
+
+define <3 x float> @_cont_ObjectRayDirection3(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) !pointeetys !49 { ; returns field 1 of %hitData
+  %resPtr.1 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 1, i32 0 ; element 0 of the second <3 x float>
+  %res.1 = load float, float* %resPtr.1, align 4
+  %resPtr.2 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 1, i32 1 ; element 1
+  %res.2 = load float, float* %resPtr.2, align 4
+  %resPtr.3 = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 1, i32 2 ; element 2
+  %res.3 = load float, float* %resPtr.3, align 4
+  %val.0 = insertelement <3 x float> undef, float %res.1, i32 0 ; reassemble the vector from the three scalar loads
+  %val.1 = insertelement <3 x float> %val.0, float %res.2, i32 1
+  %val.2 = insertelement <3 x float> %val.1, float %res.3, i32 2
+  ret <3 x float> %val.2
+}
+
+define float @_cont_RayTCurrent(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) !pointeetys !51 { ; returns the float at field 2 of %hitData
+  %resPtr = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 2
+  %res = load float, float* %resPtr, align 4
+  ret float %res
+}
+
+; RayGen: In this test case, we have mostly constant system data (_cont_TraceRay uses zero-initialized traversal system data),
+;         undef padding for the candidate, and constant payload. The storage for committed hit attributes
+;         within the payload storage is undef as well.
+;         Note that the dispatch system data (passed in the first args) is dynamic although it preserves an
+;         argument incoming to RayGen. This is because we only allow arg preservation *within* Traversal.
+; CHECK-LABEL: [SDS] Finished analysis of function MyRayGen
+; CHECK-NEXT:  [SDS] 0         1         2         3         4     {{$}}
+; CHECK-NEXT:  [SDS] 0123456789012345678901234567890123456789012345{{$}}
+; CHECK-NEXT:  [SDS] DDDCCCCCCCCCCCCCCCDDUUUUUUUUUUUUUUUUCUUUUUUCCC{{$}}
+;                    ^^^ dynamic dispatch system data
+;                       ^^^^^^^^^^^^^^^ constant ray
+;                                      ^^ dynamic raygen.resume return addr
+;                                        ^^^^^^^^^^^^^^^^ undef candidate
+;                                                        ^      ^^^ constant payload
+;                                                         ^^^^^^ undef committed attrs
+define void @MyRayGen() #2 {
+  %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4
+  %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4
+  %3 = alloca %struct.RayPayload, align 4
+  %4 = bitcast %struct.RayPayload* %3 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %4) #1
+  %5 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %3, i32 0, i32 0
+  store <4 x float> zeroinitializer, <4 x float>* %5, align 4, !tbaa !52
+  %6 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1)
+  %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 16, i32 0 })
+  call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %7, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %3)
+  %8 = load <4 x float>, <4 x float>* %5, align 4, !tbaa !52
+  %9 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0)
+  %10 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 1)
+  %11 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %2)
+  %12 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %11, %dx.types.ResourceProperties { i32 4098, i32 1033 })
+  %13 = extractelement <4 x float> %8, i64 0
+  %14 = extractelement <4 x float> %8, i64 1
+  %15 = extractelement <4 x float> %8, i64 2
+  %16 = extractelement <4 x float> %8, i64 3
+  call void @dx.op.textureStore.f32(i32 67, %dx.types.Handle %12, i32 %9, i32 %10, i32 undef, float %13, float %14, float %15, float %16, i8 15)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %4) #1
+  ret void
+}
+
+; Non-recursive CHS: No calls to Traversal, so no state to report.
+; CHECK-LABEL: [SDS] Finished analysis of function MyClosestHitShader
+; CHECK-NEXT:  [SDS] <empty>
+define void @MyClosestHitShader(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #2 !pointeetys !55 {
+  %1 = getelementptr inbounds %struct.BuiltInTriangleIntersectionAttributes, %struct.BuiltInTriangleIntersectionAttributes* %attr, i32 0, i32 0
+  %2 = load <2 x float>, <2 x float>* %1, align 4
+  %3 = extractelement <2 x float> %2, i32 0
+  %4 = fsub fast float 1.000000e+00, %3
+  %5 = extractelement <2 x float> %2, i32 1
+  %6 = fsub fast float %4, %5
+  %7 = insertelement <4 x float> undef, float %6, i64 0
+  %8 = insertelement <4 x float> %7, float %3, i64 1
+  %9 = insertelement <4 x float> %8, float %5, i64 2
+  %10 = insertelement <4 x float> %9, float 1.000000e+00, i64 3
+  %11 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0
+  store <4 x float> %10, <4 x float>* %11, align 4
+  ret void
+}
+
+; AnyHit: Payload and committed hit attrs are preserved.
+; CHECK-LABEL: [SDS] Finished analysis of function MyAnyHitShader
+; CHECK-NEXT:  [SDS] 0         1         2         3         4     {{$}}
+; CHECK-NEXT:  [SDS] 0123456789012345678901234567890123456789012345{{$}}
+; CHECK-NEXT:  [SDS] DDDDDDDDDDDDDDDDDDDDDDDDDDDDUUUUUUUUPPPPPPPPPP{{$}}
+define void @MyAnyHitShader(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readnone %attr) #2 !pointeetys !55 {
+  %1 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0
+  %2 = load <4 x float>, <4 x float>* %1, align 4
+  %3 = call float @dx.op.objectRayOrigin.f32(i32 149, i8 0)
+  %4 = call float @dx.op.objectRayDirection.f32(i32 150, i8 0)
+  %5 = call float @dx.op.rayTCurrent.f32(i32 154)
+  %6 = fmul fast float %5, %4
+  %7 = fadd fast float %6, %3
+  %8 = fcmp fast ogt float %7, 0.000000e+00
+  %9 = fcmp fast ogt float %7, 1.000000e+00
+  %10 = fcmp fast ogt float %7, -1.000000e+00
+  br i1 %8, label %11, label %14
+
+11:                                               ; preds = %0
+; acceptHitAndEndSearch
+  store <4 x float> %2, <4 x float>* %1, align 4
+  br i1 %9, label %12, label %13
+
+12:                                               ; preds = %11
+; acceptHitAndEndSearch with unreachable
+  call void @dx.op.acceptHitAndEndSearch(i32 156)
+  unreachable
+
+13:                                               ; preds = %11
+; acceptHitAndEndSearch with ret void
+  call void @dx.op.acceptHitAndEndSearch(i32 156)
+  ret void
+
+14:                                               ; preds = %0
+; IgnoreHit or AcceptHit
+  br i1 %10, label %15, label %18
+
+15:                                               ; preds = %14
+; IgnoreHit
+  br i1 %9, label %16, label %17
+
+16:                                               ; preds = %15
+; IgnoreHit with unreachable
+  call void @dx.op.ignoreHit(i32 155)
+  unreachable
+
+17:                                               ; preds = %15
+; IgnoreHit with ret void (as emitted by debug mode dxc)
+  call void @dx.op.ignoreHit(i32 155)
+  ret void
+
+18:                                               ; preds = %14
+; AcceptHit
+  store <4 x float> %2, <4 x float>* %1, align 4
+  ret void
+}
+
+; Intersection: The payload is preserved, even across ReportHit calls.
+; Six argument slots unused by the small hit attributes are undef.
+; CHECK-LABEL: [SDS] Finished analysis of function MyIntersectionShader
+; CHECK-NEXT:  [SDS] 0         1         2         3         4         5         6     {{$}}
+; CHECK-NEXT:  [SDS] 012345678901234567890123456789012345678901234567890123456789012345{{$}}
+; CHECK-NEXT:  [SDS] DDDPPPPPPPPPPPPPPPPPPPPPPPPPDCUUUUUUPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP{{$}}
+define void @MyIntersectionShader() #2 {
+  %1 = alloca %struct.BuiltInTriangleIntersectionAttributes, align 4
+  %2 = call float @dx.op.rayTCurrent.f32(i32 154)
+  %3 = bitcast %struct.BuiltInTriangleIntersectionAttributes* %1 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* %3) #1
+  %4 = call i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32 158, float %2, i32 0, %struct.BuiltInTriangleIntersectionAttributes* nonnull %1)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* %3) #1
+  ret void
+}
+
+; Intersection with ReportHit in a loop: The analysis doesn't understand that the payload is preserved,
+; because we don't repeatedly propagate through loops. This could be improved in ValueOriginTracking.
+; CHECK-LABEL: [SDS] Finished analysis of function MyIntersectionShaderLoop
+; CHECK-NEXT:  [SDS] 0         1         2         3         4         5         6     {{$}}
+; CHECK-NEXT:  [SDS] 012345678901234567890123456789012345678901234567890123456789012345{{$}}
+; CHECK-NEXT:  [SDS] DDDDDDDDDDDDDDDDDDDDDDDDDDDDDCUUUUUUDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD{{$}}
+define void @MyIntersectionShaderLoop() #2 {
+  %1 = alloca %struct.BuiltInTriangleIntersectionAttributes, align 4
+  %2 = call float @dx.op.rayTCurrent.f32(i32 154)
+  %3 = bitcast %struct.BuiltInTriangleIntersectionAttributes* %1 to i8*
+  br label %loop
+loop:
+  %4 = call i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32 158, float %2, i32 0, %struct.BuiltInTriangleIntersectionAttributes* nonnull %1)
+  br i1 %4, label %loop, label %exit
+exit:
+  ret void
+}
+
+; Non-recursive Miss: No calls to Traversal, so no state to report.
+; CHECK-LABEL: [SDS] Finished analysis of function MyMissShader
+; CHECK-NEXT:  [SDS] <empty>
+define void @MyMissShader(%struct.RayPayload* noalias nocapture %payload) #2 !pointeetys !58 {
+  %1 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0
+  store <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, <4 x float>* %1, align 4
+  ret void
+}
+
+; Recursive Miss: This passes the incoming payload through to traceRay, but it's treated as dynamic because Miss is outside of Traversal.
+; CHECK-LABEL: [SDS] Finished analysis of function MyMissShaderRecursive
+; CHECK-NEXT:  [SDS] 0         1         2         3         4     {{$}}
+; CHECK-NEXT:  [SDS] 0123456789012345678901234567890123456789012345{{$}}
+; CHECK-NEXT:  [SDS] DDDCCCCCCCCCCCCCCCDDUUUUUUUUUUUUUUUUDDDDDDDDDD{{$}}
+define void @MyMissShaderRecursive(%struct.RayPayload* noalias nocapture %payload) #2 !pointeetys !58 {
+  %tmp1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4
+  %tmp6 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %tmp1)
+  %tmp7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %tmp6, %dx.types.ResourceProperties { i32 16, i32 0 })
+  call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %tmp7, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %payload)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare !pointeetys !59 void @dx.op.traceRay.struct.RayPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.RayPayload*) #1
+
+; Function Attrs: nounwind
+declare void @dx.op.textureStore.f32(i32, %dx.types.Handle, i32, i32, i32, float, float, float, float, i8) #1
+
+; Function Attrs: nounwind memory(none)
+declare i32 @dx.op.dispatchRaysIndex.i32(i32, i8) #3
+
+; Function Attrs: nounwind memory(none)
+declare float @dx.op.objectRayDirection.f32(i32, i8) #3
+
+; Function Attrs: nounwind memory(none)
+declare float @dx.op.objectRayOrigin.f32(i32, i8) #3
+
+; Function Attrs: nounwind memory(read)
+declare float @dx.op.rayTCurrent.f32(i32) #4
+
+declare void @dx.op.acceptHitAndEndSearch(i32) #0
+
+declare void @dx.op.ignoreHit(i32) #0
+
+; Function Attrs: nounwind
+declare !pointeetys !60 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes*) #1
+
+; Function Attrs: nounwind
+declare !pointeetys !61 i1 @dx.op.reportHit.struct.BuiltInTriangleIntersectionAttributes2(i32, float, i32, %struct.BuiltInTriangleIntersectionAttributes2*) #1
+
+; Function Attrs: nounwind memory(none)
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #3
+
+; Function Attrs: nounwind memory(read)
+declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #4
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare !pointeetys !63 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare !pointeetys !63 void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5
+
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind memory(none) }
+attributes #4 = { nounwind memory(read) }
+attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+
+!llvm.ident = !{!0}
+!dx.version = !{!1}
+!dx.valver = !{!1}
+!dx.shaderModel = !{!2}
+!dx.resources = !{!3}
+!dx.typeAnnotations = !{!10}
+!dx.entryPoints = !{!18, !20, !23, !25, !27, !29, !31, !65}
+
+!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"}
+!1 = !{i32 1, i32 6}
+!2 = !{!"lib", i32 6, i32 6}
+!3 = !{!4, !7, null, null}
+!4 = !{!5}
+!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"Scene", i32 0, i32 0, i32 1, i32 16, i32 0, !6}
+!6 = !{i32 0, i32 4}
+!7 = !{!8}
+!8 = !{i32 0, %"class.RWTexture2D<vector<float, 4> >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D<vector<float, 4> >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9}
+!9 = !{i32 0, i32 9}
+!10 = !{i32 1, void ()* @MyRayGen, !11, void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @MyClosestHitShader, !14, void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @MyAnyHitShader, !14, void ()* @MyIntersectionShader, !11, void ()* @MyIntersectionShaderLoop, !11, void (%struct.RayPayload*)* @MyMissShader, !17}
+!11 = !{!12}
+!12 = !{i32 1, !13, !13}
+!13 = !{}
+!14 = !{!12, !15, !16}
+!15 = !{i32 2, !13, !13}
+!16 = !{i32 0, !13, !13}
+!17 = !{!12, !15}
+!18 = !{null, !"", null, !3, !19}
+!19 = !{i32 0, i64 65536}
+!20 = !{void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @MyAnyHitShader, !"MyAnyHitShader", null, null, !21}
+!21 = !{i32 8, i32 9, i32 6, i32 16, i32 7, i32 8, i32 5, !22}
+!22 = !{i32 0}
+!23 = !{void (%struct.RayPayload*, %struct.BuiltInTriangleIntersectionAttributes*)* @MyClosestHitShader, !"MyClosestHitShader", null, null, !24}
+!24 = !{i32 8, i32 10, i32 6, i32 16, i32 7, i32 8, i32 5, !22}
+!25 = !{void ()* @MyIntersectionShader, !"MyIntersectionShader", null, null, !26}
+!26 = !{i32 8, i32 8, i32 5, !22}
+!27 = !{void (%struct.RayPayload*)* @MyMissShader, !"MyMissShader", null, null, !28}
+!28 = !{i32 8, i32 11, i32 6, i32 16, i32 5, !22}
+!29 = !{void ()* @MyRayGen, !"MyRayGen", null, null, !30}
+!30 = !{i32 8, i32 7, i32 5, !22}
+!31 = !{void ()* @MyIntersectionShaderLoop, !"MyIntersectionShaderLoop", null, null, !26}
+!32 = !{%struct.AnyHitTraversalData poison}
+!33 = !{i32 0, %struct.AnyHitTraversalData poison}
+!34 = !{%struct.SystemData poison}
+!35 = !{i32 0, %struct.SystemData poison}
+!36 = !{%struct.SystemData poison}
+!37 = !{%struct.SystemData poison}
+!38 = !{%struct.DispatchSystemData poison}
+!39 = !{i32 0, %struct.DispatchSystemData poison}
+!40 = !{%struct.TraversalData poison}
+!41 = !{i32 0, %struct.TraversalData poison}
+!42 = !{%struct.SystemData poison}
+!43 = !{%struct.DispatchSystemData poison}
+!44 = !{%struct.AnyHitTraversalData poison}
+!45 = !{%struct.DispatchSystemData poison}
+!46 = !{%struct.DispatchSystemData poison}
+!47 = !{%struct.AnyHitTraversalData poison}
+!48 = !{%struct.DispatchSystemData poison}
+!49 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison}
+!50 = !{i32 0, %struct.HitData poison}
+!51 = !{null, %struct.DispatchSystemData poison, %struct.HitData poison}
+!52 = !{!53, !53, i64 0}
+!53 = !{!"omnipotent char", !54, i64 0}
+!54 = !{!"Simple C/C++ TBAA"}
+!55 = !{null, %struct.RayPayload poison, %struct.BuiltInTriangleIntersectionAttributes poison}
+!56 = !{i32 0, %struct.RayPayload poison}
+!57 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes poison}
+!58 = !{%struct.RayPayload poison}
+!59 = !{%struct.RayPayload poison}
+!60 = !{%struct.BuiltInTriangleIntersectionAttributes poison}
+!61 = !{%struct.BuiltInTriangleIntersectionAttributes2 poison}
+!62 = !{i32 0, %struct.BuiltInTriangleIntersectionAttributes2 poison}
+!63 = !{i8 poison}
+!64 = !{i32 0, i8 poison}
+!65 = !{void (%struct.RayPayload*)* @MyMissShaderRecursive, !"MyMissShaderRecursive", null, null, !28}
diff --git a/llvmraytracing/test/dx/specialize-driver-shaders/specialization.ll b/llvmraytracing/test/dx/specialize-driver-shaders/specialization.ll
new file mode 100644
index 0000000000..4cea4eaad6
--- /dev/null
+++ b/llvmraytracing/test/dx/specialize-driver-shaders/specialization.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+;
+; Traversal specialization tests. The Traversal functions in this module always pass through args,
+; and the module contains metadata with argument slot infos.
+; Value specialization has its own lit tests, so we focus here
+; on everything that is implemented in SpecializeDriverShadersPass, particularly regarding the argument slot handling.
+;
+; RUN: opt --verify-each -passes='specialize-driver-shaders' -S %s | FileCheck %s
+;
+; Intentionally align i64 to 64 bits so we can test specializations within types with padding.
+; i16 is aligned to 16 bits so we can test smaller-than-dword scalars.
+; f32 is aligned to 16 bits to test misaligned dword-sized scalars.
+target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:16-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32"
+
+; Ignored prefix args: shaderAddr, levels, state, returnAddr, shaderRecIdx
+declare void @lgc.cps.jump(...)
+
+define void @SimpleArray({}, i32 %ret.addr, i32, [4 x i32] %args) !lgc.rt.shaderstage !{i32 6} {
+; CHECK-LABEL: define void @SimpleArray(
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[RET_ADDR:%.*]], i32 [[TMP1:%.*]], [4 x i32] [[ARGS:%.*]]) !lgc.rt.shaderstage [[META2:![0-9]+]] {
+; CHECK-NEXT:    [[ARGS_SPECIALIZED:%.*]] = insertvalue [4 x i32] [[ARGS]], i32 42, 1
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze i32 poison
+; CHECK-NEXT:    [[ARGS_SPECIALIZED1:%.*]] = insertvalue [4 x i32] [[ARGS_SPECIALIZED]], i32 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = freeze i32 poison
+; CHECK-NEXT:    [[ARGS_SPECIALIZED2:%.*]] = insertvalue [4 x i32] [[ARGS_SPECIALIZED1]], i32 [[TMP4]], 3
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, [4 x i32] [[ARGS_SPECIALIZED2]])
+; CHECK-NEXT:    unreachable
+;
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, [4 x i32] %args)
+  unreachable
+}
+
+define void @SimpleScalars({}, i32 %ret.addr, i32, i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3) !lgc.rt.shaderstage !{i32 6} {
+; CHECK-LABEL: define void @SimpleScalars(
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[RET_ADDR:%.*]], i32 [[TMP1:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 [[ARG3:%.*]]) !lgc.rt.shaderstage [[META2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze i32 poison
+; CHECK-NEXT:    [[TMP4:%.*]] = freeze i32 poison
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 [[ARG0]], i32 42, i32 [[TMP3]], i32 [[TMP4]])
+; CHECK-NEXT:    unreachable
+;
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3)
+  unreachable
+}
+
+define void @I16s({}, i32 %ret.addr, i32, i16 %arg0, i16 %arg1, i16 %arg2, i16 %arg3) !lgc.rt.shaderstage !{i32 6} {
+; CHECK-LABEL: define void @I16s(
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[RET_ADDR:%.*]], i32 [[TMP1:%.*]], i16 [[ARG0:%.*]], i16 [[ARG1:%.*]], i16 [[ARG2:%.*]], i16 [[ARG3:%.*]]) !lgc.rt.shaderstage [[META2]] {
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i16 [[ARG0]], i16 [[ARG1]], i16 [[ARG2]], i16 [[ARG3]])
+; CHECK-NEXT:    unreachable
+;
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i16 %arg0, i16 %arg1, i16 %arg2, i16 %arg3)
+  unreachable
+}
+
+; Test that even if specialization of i16 arguments is ignored, we still specialize i32s.
+define void @MixedI16I32s({}, i32 %ret.addr, i32, i16 %arg0, i32 %arg1, i16 %arg2, i32 %arg3) !lgc.rt.shaderstage !{i32 6} {
+; CHECK-LABEL: define void @MixedI16I32s(
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[RET_ADDR:%.*]], i32 [[TMP1:%.*]], i16 [[ARG0:%.*]], i32 [[ARG1:%.*]], i16 [[ARG2:%.*]], i32 [[ARG3:%.*]]) !lgc.rt.shaderstage [[META2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze i32 poison
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i16 [[ARG0]], i32 42, i16 [[ARG2]], i32 [[TMP3]])
+; CHECK-NEXT:    unreachable
+;
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, i16 %arg0, i32 %arg1, i16 %arg2, i32 %arg3)
+  unreachable
+}
+
+; Test that specializing an arg slot that occupies a full but misaligned dword in the argument isn't supported.
+; In this test, the first contained float scalar is specialized because it is dword-aligned,
+; but the second one isn't because it is misaligned. This is because i16 and float use 16-bit alignment in this test.
+define void @MisalignedDwords({}, i32 %ret.addr, i32, { i32, float, i16, float, i32 } %args) !lgc.rt.shaderstage !{i32 6} {
+; CHECK-LABEL: define void @MisalignedDwords(
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[RET_ADDR:%.*]], i32 [[TMP1:%.*]], { i32, float, i16, float, i32 } [[ARGS:%.*]]) !lgc.rt.shaderstage [[META2]] {
+; CHECK-NEXT:    [[ARGS_SPECIALIZED:%.*]] = insertvalue { i32, float, i16, float, i32 } [[ARGS]], float 0x36F5000000000000, 1
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, { i32, float, i16, float, i32 } [[ARGS_SPECIALIZED]])
+; CHECK-NEXT:    unreachable
+;
+  call void (...) @lgc.cps.jump(i32 poison, i32 poison, {} poison, i32 poison, i32 poison, { i32, float, i16, float, i32 } %args)
+  unreachable
+}
+
+!lgc.cps.module = !{}
+!lgc.rt.specialize.driver.shaders.state = !{!0}
+; Disable analysis, so traversal variants that we can't handle don't affect other functions in this test.
+!lgc.rt.specialize.driver.shaders.opts = !{!1}
+
+; Numerical status values:
+;
+;    Status        | Value
+;    =====================
+;    Dynamic       |     0
+;    Constant      |     1
+;    UndefOrPoison |     2
+;    Preserve      |     3
+;
+
+!0 = !{
+; Status |        [Constant] | Arg slot idx
+  i32 0,   i32           0, ;            0
+  i32 1,   i32          42, ;            1
+  i32 2,   i32           0, ;            2
+  i32 3,   i32           0, ;            3
+  i32 0,   i32           0  ;            4
+}
+!1 = !{i32 0, i32 1}
diff --git a/llvmraytracing/test/dx/stats-report-sizes.ll b/llvmraytracing/test/dx/stats-report-sizes.ll
index 4f36c59d7a..58348b9fb1 100644
--- a/llvmraytracing/test/dx/stats-report-sizes.ll
+++ b/llvmraytracing/test/dx/stats-report-sizes.ll
@@ -15,10 +15,11 @@ declare void @lgc.cps.jump(...)
 ; REPORT-CONT-SIZES: Continuation state size of "RayGen" (raygeneration): 108 bytes
 ; REPORT-PAYLOAD-SIZES: Incoming and max outgoing payload VGPR size of "RayGen" (raygeneration): 7 and 6 dwords
 define void @RayGen(i64 %dummyRetAddr, %struct.DispatchSystemData %0) !continuation.entry !0 !continuation !3 !continuation.state !5 !continuation.registercount !7 !lgc.rt.shaderstage !12 {
-  %csp = alloca i32, align 4
+  %ptr = alloca i32, align 4
   %cspInit = call i32 @continuation.initialContinuationStackPtr()
-  store i32 %cspInit, i32* %csp
-  call void (...) @lgc.cps.jump(i64 2, i32 poison, {} poison, i64 poison), !continuation.registercount !6
+  store i32 %cspInit, i32* %ptr
+  %csp = load i32, ptr %ptr, align 4
+  call void (...) @lgc.cps.jump(i64 2, i32 poison, {} poison, i32 %csp, i64 poison), !continuation.registercount !6
   ret void
 }
 
@@ -32,7 +33,7 @@ define void @RayGen.resume.0(i64 %0, { %struct.DispatchSystemData } %1) !continu
 ; REPORT-PAYLOAD-SIZES: Incoming and max outgoing payload VGPR size of "CHS" (closesthit): 8 and 9 dwords
 ; REPORT-SYSTEM-DATA-SIZES-DAG: Incoming system data of "CHS" (closesthit) is "struct.CHSSystemData", size: 400 bytes
 define void @CHS(i64 %returnAddr, %struct.CHSSystemData %0) !continuation !14 !continuation.registercount !8 !lgc.rt.shaderstage !13 {
-  call void ( ...) @lgc.cps.jump(i64 2, i32 poison, {} poison, i64 poison), !continuation.registercount !9
+  call void ( ...) @lgc.cps.jump(i64 2, i32 poison, {} poison, i32 poison, i64 poison), !continuation.registercount !9
   ret void
 }
 
diff --git a/llvmraytracing/test/dx/traceray.ll b/llvmraytracing/test/dx/traceray.ll
index b6b73db5bb..b6e6a1ddee 100644
--- a/llvmraytracing/test/dx/traceray.ll
+++ b/llvmraytracing/test/dx/traceray.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
 ; RUN: grep -v lgc.cps.module %s | grep -v SKIP_GLOBAL_ADDRSPACE | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s
-; RUN: grep -v lgc.cps.module %s | grep -v SKIP_GLOBAL_ADDRSPACE | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s
-; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS-GLOBAL %s
+; RUN: grep -v lgc.cps.module %s | grep -v SKIP_GLOBAL_ADDRSPACE | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s
+; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS-GLOBAL %s
 ; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-CPS %s
 ; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=CLEANUP-CPS %s
 ; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS-CPS %s
@@ -111,7 +111,7 @@ define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !poi
 define void @_cont_KernelEntry() #0 !lgc.rt.shaderstage !69 {
   %cspInit = ptrtoint ptr @debug_global to i32
   call void @_AmdContStackSetPtr(i32 %cspInit)
-  call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 undef, %struct.DispatchSystemData poison)
+  call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i32 poison, i64 undef, %struct.DispatchSystemData poison)
   ret void
 }
 
@@ -423,7 +423,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META36:![0-9]+]] !continuation.registercount [[META22:![0-9]+]] {
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32
 ; LOWERRAYTRACINGPIPELINE-NEXT:    call void @_AmdContStackSetPtr(i32 [[CSPINIT]])
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison), !continuation.registercount [[META22]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i32 poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison), !continuation.registercount [[META22]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    ret void
 ;
 ;
@@ -478,11 +478,11 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP17]], ptr [[TMP18]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP41:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP42:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP41]]), !continuation.registercount [[META34:![0-9]+]], !waitmask [[META41:![0-9]+]], !continuation.returnedRegistercount [[META34]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP43:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } @await(ptr [[TMP42]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP43:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa21i32a10i32s(i64 4, i32 8, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP41]]), !continuation.registercount [[META34:![0-9]+]], !waitmask [[META13]], !continuation.returnedRegistercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP24:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP43]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [10 x i32] [[TMP24]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP40:%.*]] = freeze [[STRUCT_RAYPAYLOAD]] poison
+; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_RAYPAYLOAD]] [[TMP40]], ptr [[TMP4]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP21:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP21]], ptr [[TMP20]], align 4
@@ -508,8 +508,8 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP29]], i8 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP30:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index()
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP30]], i8 1
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]])
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP32:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP40]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 })
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP42:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP32:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP42]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 })
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP33:%.*]] = extractelement <4 x float> [[TMP28]], i64 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP34:%.*]] = extractelement <4 x float> [[TMP28]], i64 1
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP35:%.*]] = extractelement <4 x float> [[TMP28]], i64 2
@@ -521,7 +521,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyClosestHitShader(
-; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META43:![0-9]+]] {
+; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META42:![0-9]+]] {
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4
@@ -585,17 +585,17 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP45:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP44]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP47:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP45]], [21 x i32] poison, [10 x i32] [[TMP47]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP45]], [21 x i32] poison, [10 x i32] [[TMP47]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ;
 ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyAnyHitShader(
-; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META44:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META45:![0-9]+]] {
+; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META44:![0-9]+]] {
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP6:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP7:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP3:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP5:%.*]] = alloca [[STRUCT_HITDATA]], align 8
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP6:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP7:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP8:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_TRAVERSALDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4
@@ -687,7 +687,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP58]], ptr [[ADDR_I1]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP60:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP68:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP60]], [8 x i32] poison, [10 x i32] [[TMP68]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP60]], [8 x i32] poison, [10 x i32] [[TMP68]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE:       59:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store <4 x float> [[TMP25]], ptr [[TMP24]], align 4
@@ -719,12 +719,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP79]], ptr [[ADDR_I2]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP81:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP78:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP81]], [8 x i32] poison, [10 x i32] [[TMP78]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP81]], [8 x i32] poison, [10 x i32] [[TMP78]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ;
 ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyIntersectionShader(
-; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation.registercount [[META33:![0-9]+]] !continuation [[META47:![0-9]+]] {
+; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45:![0-9]+]] !continuation.registercount [[META33:![0-9]+]] !continuation [[META46:![0-9]+]] {
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4
@@ -747,8 +747,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TRAV_DATA_I:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP8:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP9:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP13:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]], [6 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP19:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } @await.1(ptr [[TMP13]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP19:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i64 3, i32 16, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]], [6 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP25:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP19]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [30 x i32] [[TMP25]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP10:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP19]], 0
@@ -772,21 +771,21 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE:       _cont_ReportHit.exit:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[ISEND_I:%.*]] = call i1 @opaqueIsEnd()
 ; LOWERRAYTRACINGPIPELINE-NEXT:    br i1 [[ISEND_I]], label [[TMP20:%.*]], label [[TMP22:%.*]]
-; LOWERRAYTRACINGPIPELINE:       22:
+; LOWERRAYTRACINGPIPELINE:       21:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP21:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP24:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
-; LOWERRAYTRACINGPIPELINE:       25:
+; LOWERRAYTRACINGPIPELINE:       24:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP23:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP27:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP23]], [8 x i32] poison, [30 x i32] [[TMP27]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP23]], [8 x i32] poison, [30 x i32] [[TMP27]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ;
 ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyIntersectionShaderLargeAttrs(
-; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46]] !continuation.registercount [[META33]] !continuation [[META48:![0-9]+]] {
+; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45]] !continuation.registercount [[META33]] !continuation [[META47:![0-9]+]] {
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP4:%.*]] = alloca [[STRUCT_LARGEINTERSECTIONATTRIBUTES:%.*]], align 4
@@ -823,8 +822,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TRAV_DATA_I:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP8:%.*]] = load [[STRUCT_LARGEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP9:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP13:%.*]] = call ptr inttoptr (i64 3 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP8]], [1 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP34:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } @await.2(ptr [[TMP13]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP34:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i64 3, i32 16, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP8]], [1 x i32] poison, [30 x i32] [[TMP9]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP35:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP34]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [30 x i32] [[TMP35]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP10:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP34]], 0
@@ -867,21 +865,21 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE:       _cont_ReportHit.exit:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[ISEND_I:%.*]] = call i1 @opaqueIsEnd()
 ; LOWERRAYTRACINGPIPELINE-NEXT:    br i1 [[ISEND_I]], label [[TMP30:%.*]], label [[TMP32:%.*]]
-; LOWERRAYTRACINGPIPELINE:       36:
+; LOWERRAYTRACINGPIPELINE:       35:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP31:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP38:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP31]], [8 x i32] poison, [30 x i32] [[TMP38]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP31]], [8 x i32] poison, [30 x i32] [[TMP38]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
-; LOWERRAYTRACINGPIPELINE:       39:
+; LOWERRAYTRACINGPIPELINE:       38:
 ; LOWERRAYTRACINGPIPELINE-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP33:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP41:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP33]], [8 x i32] poison, [30 x i32] [[TMP41]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP33]], [8 x i32] poison, [30 x i32] [[TMP41]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ;
 ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyMissShader(
-; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META49:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META50:![0-9]+]] {
+; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META48:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META49:![0-9]+]] {
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8
@@ -924,7 +922,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP28:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP27]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP29:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP28]], [21 x i32] poison, [10 x i32] [[TMP29]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP28]], [21 x i32] poison, [10 x i32] [[TMP29]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
 ;
@@ -954,7 +952,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define void @_cont_KernelEntry(
-; DXILCONTPOSTPROCESS-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] {
+; DXILCONTPOSTPROCESS-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META36:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-NEXT:    [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -978,7 +976,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define void @MyRayGen(
-; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation [[META38:![0-9]+]] {
+; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation [[META37:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -1018,7 +1016,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @MyRayGen.resume.0(
-; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META38]] {
+; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META37]] {
 ; DXILCONTPOSTPROCESS-NEXT:  entryresume.0:
 ; DXILCONTPOSTPROCESS-NEXT:    [[SYSTEM_DATA_ALLOCA1:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8
 ; DXILCONTPOSTPROCESS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
@@ -1036,8 +1034,10 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP3:%.*]] = extractvalue [10 x i32] [[TMP18]], 7
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP5:%.*]] = extractvalue [10 x i32] [[TMP18]], 8
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP7:%.*]] = extractvalue [10 x i32] [[TMP18]], 9
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP21:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; DXILCONTPOSTPROCESS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP21]], 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP2:%.*]] = bitcast i32 [[TMP12]] to float
-; DXILCONTPOSTPROCESS-NEXT:    [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
+; DXILCONTPOSTPROCESS-NEXT:    [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTFCA_0_EXTRACT]], float [[TMP2]], i32 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
 ; DXILCONTPOSTPROCESS-NEXT:    [[DOTSROA_0_4_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_0_VEC_INSERT]], float [[TMP4]], i32 1
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP6:%.*]] = bitcast i32 [[TMP5]] to float
@@ -1052,20 +1052,18 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; DXILCONTPOSTPROCESS-NEXT:    [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP10]], i8 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP11:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA1]])
 ; DXILCONTPOSTPROCESS-NEXT:    [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP11]], i8 1
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP21:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP9]])
-; DXILCONTPOSTPROCESS-NEXT:    [[TMP13:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP21]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 })
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP22:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP9]])
+; DXILCONTPOSTPROCESS-NEXT:    [[TMP13:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP22]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 })
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 3
 ; DXILCONTPOSTPROCESS-NEXT:    call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP13]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP14]], float [[TMP15]], float [[TMP16]], float [[TMP17]], i8 15)
 ; DXILCONTPOSTPROCESS-NEXT:    ret void
-; DXILCONTPOSTPROCESS:       entryresume.0.split:
-; DXILCONTPOSTPROCESS-NEXT:    unreachable
 ;
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define void @MyClosestHitShader(
-; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] {
+; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -1133,7 +1131,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define void @MyAnyHitShader(
-; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] {
+; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
 ; DXILCONTPOSTPROCESS-NEXT:    [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8
@@ -1367,7 +1365,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define void @MyIntersectionShader(
-; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] !continuation.stacksize [[META45:![0-9]+]] {
+; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] !continuation.stacksize [[META44:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -1572,7 +1570,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @MyIntersectionShader.resume.0(
-; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META43]] !continuation [[META44]] {
+; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation [[META43]] {
 ; DXILCONTPOSTPROCESS-NEXT:  entryresume.0:
 ; DXILCONTPOSTPROCESS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -1720,7 +1718,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define void @MyIntersectionShaderLargeAttrs(
-; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43]] !continuation [[META46:![0-9]+]] !continuation.stacksize [[META45]] {
+; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !continuation [[META45:![0-9]+]] !continuation.stacksize [[META44]] {
 ; DXILCONTPOSTPROCESS-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -1927,7 +1925,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0(
-; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META43]] !continuation [[META46]] {
+; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation [[META45]] {
 ; DXILCONTPOSTPROCESS-NEXT:  entryresume.0:
 ; DXILCONTPOSTPROCESS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -2075,7 +2073,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-LABEL: define void @MyMissShader(
-; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META47:![0-9]+]] !continuation [[META48:![0-9]+]] {
+; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation [[META47:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -2150,7 +2148,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @_cont_KernelEntry(
-; DXILCONTPOSTPROCESS-GLOBAL-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] {
+; DXILCONTPOSTPROCESS-GLOBAL-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META36:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP1:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase()
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(22)
@@ -2176,7 +2174,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyRayGen(
-; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation [[META38:![0-9]+]] {
+; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation [[META37:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -2218,7 +2216,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define dso_local void @MyRayGen.resume.0(
-; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META38]] {
+; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation [[META37]] {
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:  entryresume.0:
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[SYSTEM_DATA_ALLOCA1:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[CSP:%.*]] = alloca i32, align 4
@@ -2238,8 +2236,10 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP5:%.*]] = extractvalue [10 x i32] [[TMP20]], 7
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP7:%.*]] = extractvalue [10 x i32] [[TMP20]], 8
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP9:%.*]] = extractvalue [10 x i32] [[TMP20]], 9
+; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP23:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP23]], 0
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
-; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP4]], i32 0
+; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTFCA_0_EXTRACT]], float [[TMP4]], i32 0
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP6:%.*]] = bitcast i32 [[TMP5]] to float
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[DOTSROA_0_4_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_0_VEC_INSERT]], float [[TMP6]], i32 1
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP8:%.*]] = bitcast i32 [[TMP7]] to float
@@ -2254,20 +2254,18 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP12]], i8 0
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP13:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA1]])
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP13]], i8 1
-; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP23:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]])
-; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP23]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 })
+; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP24:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]])
+; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP24]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 })
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 3
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP15]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP16]], float [[TMP17]], float [[TMP18]], float [[TMP19]], i8 15)
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    ret void
-; DXILCONTPOSTPROCESS-GLOBAL:       entryresume.0.split:
-; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    unreachable
 ;
 ;
 ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyClosestHitShader(
-; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] {
+; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -2337,7 +2335,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyAnyHitShader(
-; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] {
+; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8
@@ -2573,7 +2571,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyIntersectionShader(
-; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] !continuation.stacksize [[META45:![0-9]+]] {
+; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] !continuation.stacksize [[META44:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -2779,7 +2777,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define dso_local void @MyIntersectionShader.resume.0(
-; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META43]] !continuation [[META44]] {
+; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation [[META43]] {
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:  entryresume.0:
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -2927,7 +2925,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyIntersectionShaderLargeAttrs(
-; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43]] !continuation [[META46:![0-9]+]] !continuation.stacksize [[META45]] {
+; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !continuation [[META45:![0-9]+]] !continuation.stacksize [[META44]] {
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -3135,7 +3133,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0(
-; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META43]] !continuation [[META46]] {
+; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation [[META45]] {
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:  entryresume.0:
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -3283,7 +3281,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyMissShader(
-; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META47:![0-9]+]] !continuation [[META48:![0-9]+]] {
+; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation [[META47:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -3363,7 +3361,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META36:![0-9]+]] {
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void @_AmdContStackSetPtr(i32 [[CSPINIT]])
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison)
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i32 poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison)
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    ret void
 ;
 ;
@@ -3418,10 +3416,11 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP20]], ptr [[TMP18]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP21:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa21i32a10i32s(i32 4, i32 8, i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP21]]), !waitmask [[META42:![0-9]+]], !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa21i32a10i32s(i32 4, i32 8, i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP21]]), !waitmask [[META13:![0-9]+]], !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP23:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP22]], 2
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [10 x i32] [[TMP23]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP39:%.*]] = freeze [[STRUCT_RAYPAYLOAD]] poison
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_RAYPAYLOAD]] [[TMP39]], ptr [[TMP4]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP26:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store i32 [[TMP26]], ptr [[TMP25]], align 4
@@ -3447,8 +3446,8 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP37]], i8 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP38:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index()
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP38]], i8 1
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP39:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]])
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP39]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 })
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP45:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]])
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP45]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 })
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP41:%.*]] = extractelement <4 x float> [[TMP36]], i64 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP42:%.*]] = extractelement <4 x float> [[TMP36]], i64 1
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP43:%.*]] = extractelement <4 x float> [[TMP36]], i64 2
@@ -3460,7 +3459,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyClosestHitShader(
-; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43:![0-9]+]] !lgc.cps [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] {
+; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !lgc.cps [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] {
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP1:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4
@@ -3523,17 +3522,17 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP41:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP40]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP42:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP41]], [21 x i32] poison, [10 x i32] [[TMP42]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP41]], [21 x i32] poison, [10 x i32] [[TMP42]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ;
 ;
 ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyAnyHitShader(
-; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META44]] !lgc.cps [[META46:![0-9]+]] !continuation [[META47:![0-9]+]] {
+; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43]] !lgc.cps [[META45:![0-9]+]] !continuation [[META46:![0-9]+]] {
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP5:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_HITDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP5:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP6:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_TRAVERSALDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4
@@ -3624,7 +3623,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP52]], ptr [[ADDR_I1]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP54:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP55:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP54]], [8 x i32] poison, [10 x i32] [[TMP55]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP54]], [8 x i32] poison, [10 x i32] [[TMP55]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE-CPS:       56:
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store <4 x float> [[TMP26]], ptr [[TMP25]], align 4
@@ -3656,12 +3655,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP72]], ptr [[ADDR_I2]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP74:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP75:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP74]], [8 x i32] poison, [10 x i32] [[TMP75]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP74]], [8 x i32] poison, [10 x i32] [[TMP75]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ;
 ;
 ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShader(
-; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META48:![0-9]+]] !continuation [[META49:![0-9]+]] {
+; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META47:![0-9]+]] !continuation [[META48:![0-9]+]] {
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4
@@ -3711,18 +3710,18 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS:       20:
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP21:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP22:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP22]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP22]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE-CPS:       23:
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP24:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP25:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP24]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP24]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ;
 ;
 ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShaderLargeAttrs(
-; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META48]] !continuation [[META50:![0-9]+]] {
+; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META47]] !continuation [[META49:![0-9]+]] {
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP3:%.*]] = alloca [[STRUCT_LARGEINTERSECTIONATTRIBUTES:%.*]], align 4
@@ -3805,18 +3804,18 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS:       34:
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP35:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP36:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP35]], [8 x i32] poison, [30 x i32] [[TMP36]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP35]], [8 x i32] poison, [30 x i32] [[TMP36]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ; LOWERRAYTRACINGPIPELINE-CPS:       37:
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP38:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP39:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP38]], [8 x i32] poison, [30 x i32] [[TMP39]]), !continuation.registercount [[META33]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP38]], [8 x i32] poison, [30 x i32] [[TMP39]]), !continuation.registercount [[META33]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ;
 ;
 ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyMissShader(
-; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46]] !lgc.cps [[META44]] !continuation [[META51:![0-9]+]] {
+; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45]] !lgc.cps [[META43]] !continuation [[META50:![0-9]+]] {
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP1:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8
@@ -3858,7 +3857,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP26:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP25]], align 4
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    [[TMP27:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [21 x i32] poison, [10 x i32] [[TMP27]]), !continuation.registercount [[META34]]
+; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [21 x i32] poison, [10 x i32] [[TMP27]]), !continuation.registercount [[META34]]
 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT:    unreachable
 ;
 ;
@@ -3891,7 +3890,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META36:![0-9]+]] {
 ; CLEANUP-CPS-NEXT:    [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32
 ; CLEANUP-CPS-NEXT:    call void @_AmdContStackSetPtr(i32 [[CSPINIT]])
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison)
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i32 poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison)
 ; CLEANUP-CPS-NEXT:    ret void
 ;
 ;
@@ -3910,7 +3909,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @MyRayGen(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META37:![0-9]+]] !continuation [[META38:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META37:![0-9]+]] !continuation [[META38:![0-9]+]] !continuation.state [[META22]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT20:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0
 ; CLEANUP-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
@@ -3942,15 +3941,16 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP8]], 7
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP9]], 8
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP10]], 9
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !waitmask [[META39:![0-9]+]], !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i32 poison, i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !waitmask [[META13:![0-9]+]], !continuation.returnedRegistercount [[META34:![0-9]+]], !continuation.registercount [[META34]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define dso_local void @MyRayGen.resume.0(
-; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META37]] !continuation [[META38]] {
+; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META37]] !continuation [[META38]] !continuation.registercount [[META34]] {
 ; CLEANUP-CPS-NEXT:  entryresume.0:
-; CLEANUP-CPS-NEXT:    [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] }, align 8
-; CLEANUP-CPS-NEXT:    store { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP3]], ptr [[TMP4]], align 4
+; CLEANUP-CPS-NEXT:    [[TMP4:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8
+; CLEANUP-CPS-NEXT:    [[TMP12:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP3]], 0
+; CLEANUP-CPS-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP12]], ptr [[TMP4]], align 4
 ; CLEANUP-CPS-NEXT:    [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP3]], 2
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 0
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 1
@@ -3962,8 +3962,10 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 7
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 8
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP5]], 9
+; CLEANUP-CPS-NEXT:    [[TMP16:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP16]], 0
 ; CLEANUP-CPS-NEXT:    [[TMP6:%.*]] = bitcast i32 [[DOTFCA_0_EXTRACT]] to float
-; CLEANUP-CPS-NEXT:    [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 0
+; CLEANUP-CPS-NEXT:    [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTFCA_0_EXTRACT1]], float [[TMP6]], i32 0
 ; CLEANUP-CPS-NEXT:    [[TMP7:%.*]] = bitcast i32 [[DOTFCA_7_EXTRACT]] to float
 ; CLEANUP-CPS-NEXT:    [[DOTSROA_0_4_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_0_VEC_INSERT]], float [[TMP7]], i32 1
 ; CLEANUP-CPS-NEXT:    [[TMP8:%.*]] = bitcast i32 [[DOTFCA_8_EXTRACT]] to float
@@ -3974,14 +3976,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_EXTRACT21:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP10]], 0
 ; CLEANUP-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
 ; CLEANUP-CPS-NEXT:    [[TMP11:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4
-; CLEANUP-CPS-NEXT:    [[TMP12:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] }, ptr [[TMP4]], i32 0, i32 0
-; CLEANUP-CPS-NEXT:    [[TMP13:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP12]])
+; CLEANUP-CPS-NEXT:    [[TMP13:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP4]])
 ; CLEANUP-CPS-NEXT:    [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP13]], i8 0
-; CLEANUP-CPS-NEXT:    [[TMP14:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] }, ptr [[TMP4]], i32 0, i32 0
-; CLEANUP-CPS-NEXT:    [[TMP15:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP14]])
+; CLEANUP-CPS-NEXT:    [[TMP15:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP4]])
 ; CLEANUP-CPS-NEXT:    [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP15]], i8 1
-; CLEANUP-CPS-NEXT:    [[TMP16:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]])
-; CLEANUP-CPS-NEXT:    [[TMP17:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP16]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 })
+; CLEANUP-CPS-NEXT:    [[TMP22:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]])
+; CLEANUP-CPS-NEXT:    [[TMP17:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP22]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 })
 ; CLEANUP-CPS-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0
 ; CLEANUP-CPS-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1
 ; CLEANUP-CPS-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2
@@ -3991,7 +3991,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @MyClosestHitShader(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !lgc.cps [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39:![0-9]+]] !lgc.cps [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] !continuation.state [[META22]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1
@@ -4051,12 +4051,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP17]], 7
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP18]], 8
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP19]], 9
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [21 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META34]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [21 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META34]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @MyAnyHitShader(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41]] !lgc.cps [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] !continuation.state [[META22]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[TMP0:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8
 ; CLEANUP-CPS-NEXT:    [[TMP1:%.*]] = alloca [[STRUCT_HITDATA]], align 8
@@ -4220,7 +4220,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP20]], 7
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP21]], 8
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP22]], 9
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT73]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META34]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT73]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META34]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       28:
 ; CLEANUP-CPS-NEXT:    call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]])
@@ -4280,12 +4280,13 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_7_INSERT49:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT46]], i32 [[TMP30]], 7
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_8_INSERT52:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT49]], i32 [[TMP31]], 8
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_9_INSERT55:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT52]], i32 [[TMP32]], 9
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT99]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT55]]), !continuation.registercount [[META34]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT99]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT55]]), !continuation.registercount [[META34]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @MyIntersectionShader(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META45:![0-9]+]] !continuation [[META46:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]]
+; CLEANUP-CPS-SAME: !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] !continuation.stacksize [[META46:![0-9]+]] !continuation.state [[META46]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8)
 ; CLEANUP-CPS-NEXT:    [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
@@ -4376,7 +4377,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShader.resume.0)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP0]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT327]], [6 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i32 poison, i64 [[TMP0]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT327]], [6 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       accepthit.i:
 ; CLEANUP-CPS-NEXT:    [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0
@@ -4431,7 +4432,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       6:
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_EXTRACT]], 0, 0, 0
@@ -4473,12 +4474,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define dso_local void @MyIntersectionShader.resume.0(
-; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META45]] !continuation [[META46]] {
+; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44]] !continuation [[META45]] !continuation.registercount [[META33]] {
 ; CLEANUP-CPS-NEXT:  entryresume.0:
 ; CLEANUP-CPS-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8)
 ; CLEANUP-CPS-NEXT:    [[TMP5:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2
@@ -4566,7 +4567,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[DOTFCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[DOTFCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       8:
 ; CLEANUP-CPS-NEXT:    [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 0
@@ -4610,12 +4611,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @MyIntersectionShaderLargeAttrs(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META45]] !continuation [[META47:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44]] !continuation [[META47:![0-9]+]] !continuation.stacksize [[META46]] !continuation.state [[META46]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8)
 ; CLEANUP-CPS-NEXT:    [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADERLARGEATTRS_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
@@ -4712,7 +4713,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShaderLargeAttrs.resume.0)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP0]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [1 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33]], !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i32 poison, i64 [[TMP0]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [1 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META33]], !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       accepthit.i:
 ; CLEANUP-CPS-NEXT:    [[TMP1:%.*]] = bitcast i32 100 to float
@@ -4763,7 +4764,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       4:
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_EXTRACT]], 0, 0, 0
@@ -4805,12 +4806,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0(
-; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META45]] !continuation [[META47]] {
+; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44]] !continuation [[META47]] !continuation.registercount [[META33]] {
 ; CLEANUP-CPS-NEXT:  entryresume.0:
 ; CLEANUP-CPS-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8)
 ; CLEANUP-CPS-NEXT:    [[TMP5:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2
@@ -4898,7 +4899,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[DOTFCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[DOTFCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD6]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD6]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT313]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ; CLEANUP-CPS:       8:
 ; CLEANUP-CPS-NEXT:    [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADERLARGEATTRS_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 0
@@ -4942,12 +4943,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29
 ; CLEANUP-CPS-NEXT:    call void @lgc.cps.free(i32 8)
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT276]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META33]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
 ; CLEANUP-CPS-LABEL: define void @MyMissShader(
-; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43]] !lgc.cps [[META41]] !continuation [[META48:![0-9]+]] {
+; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !lgc.cps [[META40]] !continuation [[META48:![0-9]+]] !continuation.state [[META22]] {
 ; CLEANUP-CPS-NEXT:  AllocaSpillBB:
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0
 ; CLEANUP-CPS-NEXT:    [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1
@@ -4989,7 +4990,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP5]], 7
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP6]], 8
 ; CLEANUP-CPS-NEXT:    [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP7]], 9
-; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [21 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META34]]
+; CLEANUP-CPS-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [21 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META34]]
 ; CLEANUP-CPS-NEXT:    unreachable
 ;
 ;
@@ -5085,10 +5086,11 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; DXILCONTPOSTPROCESS-CPS-LABEL: define dso_local void @MyRayGen.resume.0(
 ; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META37]] !continuation [[META38]] {
 ; DXILCONTPOSTPROCESS-CPS-NEXT:  entryresume.0:
-; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] }, align 8
+; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP13:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
-; DXILCONTPOSTPROCESS-CPS-NEXT:    store { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP3]], ptr [[TMP4]], align 4
+; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP3]], 0
+; DXILCONTPOSTPROCESS-CPS-NEXT:    store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP4]], ptr [[TMP13]], align 4
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP3]], 2
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP6]], 0
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP6]], 1
@@ -5100,8 +5102,10 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP6]], 7
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP6]], 8
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [10 x i32] [[TMP6]], 9
+; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP23:%.*]] = freeze [[STRUCT_RAYPAYLOAD:%.*]] poison
+; DXILCONTPOSTPROCESS-CPS-NEXT:    [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_RAYPAYLOAD]] [[TMP23]], 0
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP7:%.*]] = bitcast i32 [[DOTFCA_0_EXTRACT]] to float
-; DXILCONTPOSTPROCESS-CPS-NEXT:    [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 0
+; DXILCONTPOSTPROCESS-CPS-NEXT:    [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTFCA_0_EXTRACT1]], float [[TMP7]], i32 0
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP8:%.*]] = bitcast i32 [[DOTFCA_7_EXTRACT]] to float
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[DOTSROA_0_4_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_0_VEC_INSERT]], float [[TMP8]], i32 1
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP9:%.*]] = bitcast i32 [[DOTFCA_8_EXTRACT]] to float
@@ -5112,14 +5116,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[DOTFCA_0_EXTRACT21:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP11]], 0
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    call void @amd.dx.setLocalRootIndex(i32 0)
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP12:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4
-; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP13:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] }, ptr [[TMP4]], i32 0, i32 0
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP14:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP13]])
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP14]], i8 0
-; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP15:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] }, ptr [[TMP4]], i32 0, i32 0
-; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP16:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP15]])
+; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP16:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP13]])
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP16]], i8 1
-; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP17:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP12]])
-; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP18:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP17]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 })
+; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP12]])
+; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP18:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP15]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 })
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2
@@ -5433,7 +5435,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyIntersectionShader(
-; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] {
+; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] !continuation.stacksize [[META46:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-CPS-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -5790,7 +5792,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyIntersectionShaderLargeAttrs(
-; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44]] !continuation [[META46:![0-9]+]] {
+; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44]] !continuation [[META47:![0-9]+]] !continuation.stacksize [[META46]] {
 ; DXILCONTPOSTPROCESS-CPS-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -5999,7 +6001,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-CPS-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0(
-; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44]] !continuation [[META46]] {
+; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META44]] !continuation [[META47]] {
 ; DXILCONTPOSTPROCESS-CPS-NEXT:  entryresume.0:
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
@@ -6149,7 +6151,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ;
 ;
 ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyMissShader(
-; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !lgc.cps [[META40]] !continuation [[META47:![0-9]+]] {
+; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !lgc.cps [[META40]] !continuation [[META48:![0-9]+]] {
 ; DXILCONTPOSTPROCESS-CPS-NEXT:  AllocaSpillBB:
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; DXILCONTPOSTPROCESS-CPS-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
diff --git a/llvmraytracing/test/dx/traversal-empty-payload.ll b/llvmraytracing/test/dx/traversal-empty-payload.ll
index 86118f8d91..12c2dddd9b 100644
--- a/llvmraytracing/test/dx/traversal-empty-payload.ll
+++ b/llvmraytracing/test/dx/traversal-empty-payload.ll
@@ -13,6 +13,7 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:
 !continuation.maxUsedPayloadRegisterCount = !{!8} ; EMPTY_PAYLOAD
 
 declare !pointeetys !4 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
+declare !pointeetys !4 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 
 declare !pointeetys !6 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind)
 
@@ -33,12 +34,12 @@ define void @_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage
 6:                                                ; preds = %0
   %7 = load %struct.SystemData, ptr %5, align 4
   %8 = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @_cont_Traversal)
-  call void (...) @lgc.cps.jump(i64 1, i32 -1, {} poison, i64 %8, %struct.SystemData %7), !waitmask !9
+  call void (...) @lgc.cps.jump(i64 1, i32 -1, {} poison, i32 poison, i64 %8, %struct.SystemData %7), !waitmask !9
   unreachable
 
 9:                                                ; preds = %0
   %10 = load %struct.SystemData, ptr %5, align 4
-  call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, %struct.SystemData %10), !waitmask !9
+  call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i32 poison, i64 poison, %struct.SystemData %10), !waitmask !9
   unreachable
 }
 
@@ -71,16 +72,16 @@ attributes #2 = { nounwind }
 ; EMPTYPAYLOAD:       7:
 ; EMPTYPAYLOAD-NEXT:    [[TMP8:%.*]] = load [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP6]], align 4
 ; EMPTYPAYLOAD-NEXT:    [[TMP9:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @_cont_Traversal)
-; EMPTYPAYLOAD-NEXT:    call void (...) @lgc.cps.jump(i64 1, i32 -1, {} poison, i64 [[TMP9]], [[STRUCT_SYSTEMDATA]] [[TMP8]]), !waitmask [[META5:![0-9]+]], !continuation.registercount [[META0]]
+; EMPTYPAYLOAD-NEXT:    call void (...) @lgc.cps.jump(i64 1, i32 -1, {} poison, i32 poison, i64 [[TMP9]], [[STRUCT_SYSTEMDATA]] [[TMP8]]), !waitmask [[META5:![0-9]+]], !continuation.registercount [[META0]]
 ; EMPTYPAYLOAD-NEXT:    unreachable
 ; EMPTYPAYLOAD:       10:
 ; EMPTYPAYLOAD-NEXT:    [[TMP13:%.*]] = load [[STRUCT_SYSTEMDATA]], ptr [[TMP6]], align 4
-; EMPTYPAYLOAD-NEXT:    call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, [[STRUCT_SYSTEMDATA]] [[TMP13]]), !waitmask [[META5]], !continuation.registercount [[META0]]
+; EMPTYPAYLOAD-NEXT:    call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_SYSTEMDATA]] [[TMP13]]), !waitmask [[META5]], !continuation.registercount [[META0]]
 ; EMPTYPAYLOAD-NEXT:    unreachable
 ;
 ;
 ; EMPTYPAYLOAD-ALL-LABEL: define void @_cont_Traversal(
-; EMPTYPAYLOAD-ALL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [0 x i32] [[PADDING:%.*]], [0 x i32] [[PAYLOAD:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META3:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !continuation [[META4:![0-9]+]] {
+; EMPTYPAYLOAD-ALL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [0 x i32] [[PADDING:%.*]], [0 x i32] [[PAYLOAD:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META3:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !continuation [[META4:![0-9]+]] !continuation.state [[META0]] {
 ; EMPTYPAYLOAD-ALL-NEXT:  AllocaSpillBB:
 ; EMPTYPAYLOAD-ALL-NEXT:    [[CSP:%.*]] = alloca i32, align 4
 ; EMPTYPAYLOAD-ALL-NEXT:    store i32 [[CSPINIT]], ptr [[CSP]], align 4
diff --git a/llvmraytracing/test/dx/traversal-passthrough-payload.ll b/llvmraytracing/test/dx/traversal-passthrough-payload.ll
index 9224962e8c..a3a2182a7b 100644
--- a/llvmraytracing/test/dx/traversal-passthrough-payload.ll
+++ b/llvmraytracing/test/dx/traversal-passthrough-payload.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
-; RUN: grep -v PRESERVED_REGCOUNT %s | opt --verify-each -passes='lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error | FileCheck --check-prefix=MAXPAYLOADSIZE %s
-; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck --check-prefix=PRESERVEDPAYLOADSIZE %s
+; RUN: grep -v PRESERVED_REGCOUNT %s | opt --verify-each -passes='lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error | FileCheck --check-prefix=MAXPAYLOADSIZE %s
+; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck --check-prefix=PRESERVEDPAYLOADSIZE %s
 
 ; Test that we pass either the maximum or the computed, preserved payload size through _cont_Traversal.
 
@@ -13,6 +13,7 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:
 !continuation.maxUsedPayloadRegisterCount = !{!8} ; PRESERVED_REGCOUNT
 
 declare !pointeetys !4 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
+declare !pointeetys !4 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 
 declare !pointeetys !6 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind)
 
@@ -33,12 +34,12 @@ define void @_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage
 6:                                                ; preds = %0
   %7 = load %struct.SystemData, ptr %5, align 4
   %8 = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @_cont_Traversal)
-  call void (...) @lgc.cps.jump(i64 1, i32 -1, {} poison, i64 %8, %struct.SystemData %7), !waitmask !9
+  call void (...) @lgc.cps.jump(i64 1, i32 -1, {} poison, i32 poison, i64 %8, %struct.SystemData %7), !waitmask !9
   unreachable
 
 9:                                                ; preds = %0
   %10 = load %struct.SystemData, ptr %5, align 4
-  call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, %struct.SystemData %10), !waitmask !9
+  call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i32 poison, i64 poison, %struct.SystemData %10), !waitmask !9
   unreachable
 }
 
diff --git a/llvmraytracing/test/dx/unnamed-type-intrinsics.ll b/llvmraytracing/test/dx/unnamed-type-intrinsics.ll
index 65e1950b5e..e8316db38f 100644
--- a/llvmraytracing/test/dx/unnamed-type-intrinsics.ll
+++ b/llvmraytracing/test/dx/unnamed-type-intrinsics.ll
@@ -382,11 +382,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP20]], ptr [[TMP25]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP21:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP28:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[TMP1]] [[TRAV_DATA2_I]], [16 x i32] poison, [10 x i32] [[TMP21]]), !continuation.registercount [[META18:![0-9]+]], !continuation.returnedRegistercount [[META18]]
-; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP35:%.*]] = call { [[TMP0]], [33 x i32], [10 x i32] } @await(ptr [[TMP28]])
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP35:%.*]] = call { [[TMP0]], [33 x i32], [10 x i32] } (...) @lgc.cps.await__sl_s_sa33i32a10i32s(i64 4, i32 8, i64 poison, [[TMP1]] [[TRAV_DATA2_I]], [16 x i32] poison, [10 x i32] [[TMP21]]), !continuation.registercount [[META18:![0-9]+]], !continuation.returnedRegistercount [[META18]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP24:%.*]] = extractvalue { [[TMP0]], [33 x i32], [10 x i32] } [[TMP35]], 2
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store [10 x i32] [[TMP24]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4
+; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP38:%.*]] = freeze [[STRUCT_RAYPAYLOAD]] poison
+; LOWERRAYTRACINGPIPELINE-NEXT:    store [[STRUCT_RAYPAYLOAD]] [[TMP38]], ptr [[TMP4]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP26:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    store i32 [[TMP26]], ptr [[TMP23]], align 4
@@ -489,6 +489,6 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[TMP2]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP47:%.*]] = load [[TMP0]], ptr [[TMP46]], align 4
 ; LOWERRAYTRACINGPIPELINE-NEXT:    [[TMP49:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[TMP0]] [[TMP47]], [33 x i32] poison, [10 x i32] [[TMP49]]), !continuation.registercount [[META18]]
+; LOWERRAYTRACINGPIPELINE-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[TMP0]] [[TMP47]], [33 x i32] poison, [10 x i32] [[TMP49]]), !continuation.registercount [[META18]]
 ; LOWERRAYTRACINGPIPELINE-NEXT:    unreachable
 ;
diff --git a/llvmraytracing/test/dx/wrong-system-data.ll b/llvmraytracing/test/dx/wrong-system-data.ll
index f988c02ec9..ef077cbed7 100644
--- a/llvmraytracing/test/dx/wrong-system-data.ll
+++ b/llvmraytracing/test/dx/wrong-system-data.ll
@@ -24,6 +24,7 @@ declare !pointeetys !31 %struct.TraversalData @_AmdAnyHit(i64, %struct.Traversal
 declare i32 @_cont_GetContinuationStackAddr() #0
 
 declare !pointeetys !33 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) #0
+declare !pointeetys !33 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 
 declare %struct.DispatchSystemData @_AmdTraversal(%struct.TraversalData) #0
 
diff --git a/llvmraytracing/test/intrinsics/get-func-addr-not-found.ll b/llvmraytracing/test/intrinsics/get-func-addr-not-found.ll
index e2ff1964d3..21832da596 100644
--- a/llvmraytracing/test/intrinsics/get-func-addr-not-found.ll
+++ b/llvmraytracing/test/intrinsics/get-func-addr-not-found.ll
@@ -8,7 +8,7 @@
 declare i64 @_AmdGetFuncAddr()
 
 declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
-
+declare !pointeetys !8 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 declare !pointeetys !11 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind)
 
 define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !pointeetys !8 {
diff --git a/llvmraytracing/test/intrinsics/get-func-addr.ll b/llvmraytracing/test/intrinsics/get-func-addr.ll
index bc6f45ca31..ab2fb5226a 100644
--- a/llvmraytracing/test/intrinsics/get-func-addr.ll
+++ b/llvmraytracing/test/intrinsics/get-func-addr.ll
@@ -7,6 +7,7 @@ declare i64 @_AmdGetFuncAddrMyFunc()
 
 %struct.TraversalData = type { }
 
+declare !pointeetys !8 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 declare !pointeetys !12 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind)
 
diff --git a/llvmraytracing/test/intrinsics/shader-start.ll b/llvmraytracing/test/intrinsics/shader-start.ll
index e46ed8bd77..f8fd137d75 100644
--- a/llvmraytracing/test/intrinsics/shader-start.ll
+++ b/llvmraytracing/test/intrinsics/shader-start.ll
@@ -6,6 +6,7 @@
 %struct.HitData = type { float, i32 }
 %struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> }
 
+declare !pointeetys !8 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 declare !pointeetys !13 i1 @_cont_ReportHit(%struct.DispatchSystemData* %data, float %t, i32 %hitKind)
 declare !pointeetys !15 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #0
@@ -21,7 +22,7 @@ define void @main() !lgc.rt.shaderstage !10 {
 ; CHECK-NEXT:    store i32 123, ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], [8 x i32] poison, [30 x i32] [[TMP2]]), !continuation.registercount [[META0]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], [8 x i32] poison, [30 x i32] [[TMP2]]), !continuation.registercount [[META0]]
 ; CHECK-NEXT:    unreachable
 ;
 entry:
diff --git a/llvmraytracing/test/lgccps/alloca-select.ll b/llvmraytracing/test/lgccps/alloca-select.ll
index 6fc00840cf..3435213e9d 100644
--- a/llvmraytracing/test/lgccps/alloca-select.ll
+++ b/llvmraytracing/test/lgccps/alloca-select.ll
@@ -1,5 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 2
 ; RUN: opt --verify-each -S  -o - -passes='lower-await,coro-early,lgc-coro-split,coro-cleanup,cleanup-continuations' %s | FileCheck --check-prefixes=CHECK %s
+
+!lgc.cps.module = !{}
+
 declare !lgc.cps !0 void @callee({}, i32, float)
 
 define void @test({} %state, i32 %rcr, float %arg, i32 %arg1) !lgc.cps !0 {
@@ -10,11 +13,11 @@ define void @test({} %state, i32 %rcr, float %arg, i32 %arg1) !lgc.cps !0 {
   store i32 111, ptr %p, align 4
   %t0 = fadd float %arg, 1.0
   %cr = call i32 @lgc.cps.as.continuation.reference(ptr @callee)
-  %t1 = call float (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %t0)
+  %t1 = call float (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %t0), !continuation.returnedRegistercount !{i32 0}
   %tmp = fmul float %t1, %arg
   %v111 = load float, ptr %p, align 4
   %returnvalue = fmul float %tmp, %v111
-  call void (...) @lgc.cps.jump(i32 %rcr, i32 2, {} poison, i32 poison, float %returnvalue)
+  call void (...) @lgc.cps.jump(i32 %rcr, i32 2, {} poison, i32 poison, i32 poison, float %returnvalue)
   unreachable
 }
 
@@ -24,7 +27,7 @@ declare i32 @lgc.cps.as.continuation.reference(...) memory(none)
 declare float @lgc.cps.await__f32(...)
 declare void @lgc.cps.jump(...)
 ; CHECK-LABEL: define void @test
-; CHECK-SAME: ({} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], i32 [[ARG1:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] {
+; CHECK-SAME: ({} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], i32 [[ARG1:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] !continuation.stacksize [[META2:![0-9]+]] !continuation.state [[META2]] {
 ; CHECK-NEXT:  AllocaSpillBB:
 ; CHECK-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 20)
 ; CHECK-NEXT:    [[A1:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
@@ -42,12 +45,12 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee)
 ; CHECK-NEXT:    [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @test.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 [[TMP1]], float [[T0]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 poison, i32 [[TMP1]], float [[T0]]), !continuation.returnedRegistercount [[META3:![0-9]+]]
 ; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define dso_local void @test.resume.0
-; CHECK-SAME: ({} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] {
+; CHECK-SAME: ({} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] !continuation.registercount [[META3]] {
 ; CHECK-NEXT:  entryresume.0:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 20)
 ; CHECK-NEXT:    [[A1:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0
@@ -64,6 +67,6 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[V111:%.*]] = load float, ptr addrspace(32) [[P1]], align 4
 ; CHECK-NEXT:    [[RETURNVALUE:%.*]] = fmul float [[TMP]], [[V111]]
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 20)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, float [[RETURNVALUE]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, i32 poison, float [[RETURNVALUE]])
 ; CHECK-NEXT:    unreachable
 ;
diff --git a/llvmraytracing/test/lgccps/await-if-else.ll b/llvmraytracing/test/lgccps/await-if-else.ll
index 331999eb9e..126ee0605c 100644
--- a/llvmraytracing/test/lgccps/await-if-else.ll
+++ b/llvmraytracing/test/lgccps/await-if-else.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
 ; RUN: opt --verify-each -S -o - -passes='lower-await,coro-early,lgc-coro-split,coro-cleanup,cleanup-continuations' %s | FileCheck --check-prefixes=CHECK %s
 
+!lgc.cps.module = !{}
+
 declare !lgc.cps !0 void @callee({}, i32, float)
 declare !lgc.cps !0 void @callee2({}, i32, float)
 
@@ -12,16 +14,16 @@ define void @test({} %state, i32 %rcr, float %arg) !lgc.cps !0 {
   br i1 %cond, label %bb1, label %bb2
 
 bb1:
-  %t1 = call float (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %arg)
+  %t1 = call float (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %arg), !continuation.returnedRegistercount !{i32 0}
   br label %bb3
 
 bb2:
-  %t2 = call float (...) @lgc.cps.await__f32(i32 %cr2, i32 2, float %t0)
+  %t2 = call float (...) @lgc.cps.await__f32(i32 %cr2, i32 2, float %t0), !continuation.returnedRegistercount !{i32 0}
   br label %bb3
 bb3:
   %t3 = phi float [%t1, %bb1], [%t2, %bb2]
   %returnvalue = fmul float %t3, %arg
-  call void (...) @lgc.cps.jump(i32 %rcr, i32 2, {} poison, i32 poison, float %returnvalue)
+  call void (...) @lgc.cps.jump(i32 %rcr, i32 2, {} poison, i32 poison, i32 poison, float %returnvalue)
   unreachable
 }
 
@@ -31,7 +33,7 @@ declare i32 @lgc.cps.as.continuation.reference(...) memory(none)
 declare float @lgc.cps.await__f32(...)
 declare void @lgc.cps.jump(...)
 ; CHECK-LABEL: define void @test(
-; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] {
+; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] !continuation.stacksize [[META2:![0-9]+]] !continuation.state [[META2]] {
 ; CHECK-NEXT:  AllocaSpillBB:
 ; CHECK-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8)
 ; CHECK-NEXT:    [[ARG_SPILL_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1
@@ -46,17 +48,17 @@ declare void @lgc.cps.jump(...)
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @test.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 [[TMP1]], float [[ARG]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 poison, i32 [[TMP1]], float [[ARG]]), !continuation.returnedRegistercount [[META3:![0-9]+]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       bb2:
 ; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i32 [[CR2]] to ptr
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @test.resume.1)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR2]], i32 2, {} poison, i32 [[TMP3]], float [[T0]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR2]], i32 2, {} poison, i32 poison, i32 [[TMP3]], float [[T0]]), !continuation.returnedRegistercount [[META3]]
 ; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define dso_local void @test.resume.0(
-; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] {
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] !continuation.registercount [[META3]] {
 ; CHECK-NEXT:  entryresume.0:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8)
 ; CHECK-NEXT:    [[ARG_RELOAD_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 1
@@ -65,12 +67,12 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[RCR_RELOAD_ADDR]], align 4
 ; CHECK-NEXT:    [[RETURNVALUE:%.*]] = fmul float [[TMP3]], [[ARG_RELOAD]]
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 8)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, float [[RETURNVALUE]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, i32 poison, float [[RETURNVALUE]])
 ; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define dso_local void @test.resume.1(
-; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] {
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] !continuation.registercount [[META3]] {
 ; CHECK-NEXT:  entryresume.1:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8)
 ; CHECK-NEXT:    [[ARG_RELOAD_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 1
@@ -79,6 +81,6 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[RCR_RELOAD_ADDR]], align 4
 ; CHECK-NEXT:    [[RETURNVALUE:%.*]] = fmul float [[TMP3]], [[ARG_RELOAD]]
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 8)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, float [[RETURNVALUE]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, i32 poison, float [[RETURNVALUE]])
 ; CHECK-NEXT:    unreachable
 ;
diff --git a/llvmraytracing/test/lgccps/await-if.ll b/llvmraytracing/test/lgccps/await-if.ll
index e63a8b9ba8..1f2568fd0a 100644
--- a/llvmraytracing/test/lgccps/await-if.ll
+++ b/llvmraytracing/test/lgccps/await-if.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
 ; RUN: opt --verify-each -S -o - -passes='lower-await,coro-early,lgc-coro-split,coro-cleanup,cleanup-continuations' %s | FileCheck --check-prefixes=CHECK %s
 
+!lgc.cps.module = !{}
+
 declare !lgc.cps !0 void @callee({}, i32, float)
 
 define void @test({} %state, i32 %rcr, float %arg) !lgc.cps !0 {
@@ -11,13 +13,13 @@ entry:
   br i1 %cond, label %bb1, label %bb2
 
 bb1:
-  %t1 = call float (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %arg)
+  %t1 = call float (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %arg), !continuation.returnedRegistercount !{i32 0}
   br label %bb2
 
 bb2:
   %t3 = phi float [%t1, %bb1], [%t0, %entry]
   %returnvalue = fmul float %t3, %arg
-  call void (...) @lgc.cps.jump(i32 %rcr, i32 2, {} poison, i32 poison, float %returnvalue)
+  call void (...) @lgc.cps.jump(i32 %rcr, i32 2, {} poison, i32 poison, i32 poison, float %returnvalue)
   unreachable
 }
 
@@ -27,7 +29,7 @@ declare i32 @lgc.cps.as.continuation.reference(...) memory(none)
 declare float @lgc.cps.await__f32(...)
 declare void @lgc.cps.jump(...)
 ; CHECK-LABEL: define void @test(
-; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] {
+; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] !continuation.stacksize [[META2:![0-9]+]] !continuation.state [[META2]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8)
 ; CHECK-NEXT:    [[ARG_SPILL_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1
@@ -41,18 +43,18 @@ declare void @lgc.cps.jump(...)
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @test.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 [[TMP1]], float [[ARG]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 poison, i32 [[TMP1]], float [[ARG]]), !continuation.returnedRegistercount [[META3:![0-9]+]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       bb2:
 ; CHECK-NEXT:    [[T0_BB2:%.*]] = phi float [ [[T0]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[RETURNVALUE:%.*]] = fmul float [[T0_BB2]], [[ARG]]
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 8)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR]], i32 2, {} poison, i32 poison, float [[RETURNVALUE]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR]], i32 2, {} poison, i32 poison, i32 poison, float [[RETURNVALUE]])
 ; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define dso_local void @test.resume.0(
-; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] {
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] !continuation.registercount [[META3]] {
 ; CHECK-NEXT:  entryresume.0:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8)
 ; CHECK-NEXT:    [[ARG_RELOAD_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 1
@@ -61,6 +63,6 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[RCR_RELOAD_ADDR]], align 4
 ; CHECK-NEXT:    [[RETURNVALUE:%.*]] = fmul float [[TMP3]], [[ARG_RELOAD]]
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 8)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, float [[RETURNVALUE]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, i32 poison, float [[RETURNVALUE]])
 ; CHECK-NEXT:    unreachable
 ;
diff --git a/llvmraytracing/test/lgccps/await-in-loop.ll b/llvmraytracing/test/lgccps/await-in-loop.ll
index e0e5e3d7ea..7e02dcecab 100644
--- a/llvmraytracing/test/lgccps/await-in-loop.ll
+++ b/llvmraytracing/test/lgccps/await-in-loop.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
 ; RUN: opt --verify-each -S -o - -passes='lower-await,coro-early,lgc-coro-split,coro-cleanup,cleanup-continuations' %s | FileCheck --check-prefixes=CHECK %s
 
+!lgc.cps.module = !{}
+
 declare !lgc.cps !0 void @callee({}, i32, i32)
 
 define void @test({} %state, i32 %rcr, float %arg, float %arg2) !lgc.cps !0 {
@@ -11,7 +13,7 @@ entry:
 
 loop:
   %ind = phi i32 [0, %entry], [%inc, %loop]
-  %t1 = call float (...) @lgc.cps.await__f32(i32 %cr, i32 2, i32 %ind)
+  %t1 = call float (...) @lgc.cps.await__f32(i32 %cr, i32 2, i32 %ind), !continuation.returnedRegistercount !{i32 0}
   %inc = add i32 %ind, 1
   %cond = fcmp olt float %t1, 5.0
   br i1 %cond, label %loop, label %end
@@ -19,7 +21,7 @@ loop:
 end:
   %t2 = fmul float %t1, %arg
   %returnvalue = fadd float %t2, %arg2
-  call void (...) @lgc.cps.jump(i32 %rcr, i32 2, {} poison, i32 poison, float %returnvalue)
+  call void (...) @lgc.cps.jump(i32 %rcr, i32 2, {} poison, i32 poison, i32 poison, float %returnvalue)
   unreachable
 }
 
@@ -29,7 +31,7 @@ declare i32 @lgc.cps.as.continuation.reference(...) memory(none)
 declare float @lgc.cps.await__f32(...)
 declare void @lgc.cps.jump(...)
 ; CHECK-LABEL: define void @test(
-; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], float [[ARG2:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] {
+; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], float [[ARG2:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] !continuation.stacksize [[META2:![0-9]+]] !continuation.state [[META2]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 20)
 ; CHECK-NEXT:    [[ARG2_SPILL_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 2
@@ -46,12 +48,12 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    store i32 0, ptr addrspace(32) [[IND_SPILL_ADDR]], align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @test.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 poison, i32 [[TMP1]], i32 0), !continuation.returnedRegistercount [[META3:![0-9]+]]
 ; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define dso_local void @test.resume.0(
-; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] {
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] !continuation.registercount [[META3]] {
 ; CHECK-NEXT:  entryresume.0:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 20)
 ; CHECK-NEXT:    [[IND_RELOAD_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 4
@@ -67,7 +69,7 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[CR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[CR_RELOAD_ADDR]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i32 [[CR_RELOAD]] to ptr
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @test.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR_RELOAD]], i32 2, {} poison, i32 [[TMP6]], i32 [[INC_LOOP]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR_RELOAD]], i32 2, {} poison, i32 poison, i32 [[TMP6]], i32 [[INC_LOOP]]), !continuation.returnedRegistercount [[META3]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       end:
 ; CHECK-NEXT:    [[ARG2_RELOAD_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 2
@@ -79,6 +81,6 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[T2:%.*]] = fmul float [[TMP3]], [[ARG_RELOAD]]
 ; CHECK-NEXT:    [[RETURNVALUE:%.*]] = fadd float [[T2]], [[ARG2_RELOAD]]
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 20)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, float [[RETURNVALUE]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, i32 poison, float [[RETURNVALUE]])
 ; CHECK-NEXT:    unreachable
 ;
diff --git a/llvmraytracing/test/lgccps/call-shader-i1-payload.ll b/llvmraytracing/test/lgccps/call-shader-i1-payload.ll
index 6af154dc4d..fed10f8292 100644
--- a/llvmraytracing/test/lgccps/call-shader-i1-payload.ll
+++ b/llvmraytracing/test/lgccps/call-shader-i1-payload.ll
@@ -20,6 +20,8 @@ declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalDat
 ; Function Attrs: alwaysinline
 declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #0
 
+declare !pointeetys !1 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
+
 ; Function Attrs: alwaysinline
 define i32 @_cont_GetLocalRootIndex(ptr %data) #0 !pointeetys !1 {
   ret i32 5
@@ -84,10 +86,11 @@ attributes #1 = { nounwind willreturn memory(argmem: readwrite, inaccessiblemem:
 ; LOWER-RAYTRACING-PIPELINE-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP23]], align 4
 ; LOWER-RAYTRACING-PIPELINE-NEXT:    store i32 [[TMP11]], ptr [[TMP10]], align 4
 ; LOWER-RAYTRACING-PIPELINE-NEXT:    [[TMP12:%.*]] = load [2 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWER-RAYTRACING-PIPELINE-NEXT:    [[TMP13:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa8i32a2i32s(i32 2, i32 4, i32 5, [9 x i32] poison, [2 x i32] [[TMP12]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; LOWER-RAYTRACING-PIPELINE-NEXT:    [[TMP13:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa8i32a2i32s(i32 2, i32 4, i32 5, [9 x i32] poison, [2 x i32] [[TMP12]]), !continuation.returnedRegistercount [[META1]], !continuation.registercount [[META1]]
 ; LOWER-RAYTRACING-PIPELINE-NEXT:    [[TMP14:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } [[TMP13]], 2
 ; LOWER-RAYTRACING-PIPELINE-NEXT:    store [2 x i32] [[TMP14]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWER-RAYTRACING-PIPELINE-NEXT:    store [[STRUCT_MYPARAMS]] poison, ptr [[TMP1]], align 4
+; LOWER-RAYTRACING-PIPELINE-NEXT:    [[TMP29:%.*]] = freeze [[STRUCT_MYPARAMS]] poison
+; LOWER-RAYTRACING-PIPELINE-NEXT:    store [[STRUCT_MYPARAMS]] [[TMP29]], ptr [[TMP1]], align 4
 ; LOWER-RAYTRACING-PIPELINE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0
 ; LOWER-RAYTRACING-PIPELINE-NEXT:    [[TMP17:%.*]] = load i32, ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
 ; LOWER-RAYTRACING-PIPELINE-NEXT:    store i32 [[TMP17]], ptr [[TMP16]], align 4
@@ -106,7 +109,7 @@ attributes #1 = { nounwind willreturn memory(argmem: readwrite, inaccessiblemem:
 ; LOWER-RAYTRACING-PIPELINE-NEXT:    store i32 [[TMP25]], ptr [[TMP24]], align 4
 ; LOWER-RAYTRACING-PIPELINE-NEXT:    [[TMP26:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4
 ; LOWER-RAYTRACING-PIPELINE-NEXT:    [[TMP27:%.*]] = load [2 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; LOWER-RAYTRACING-PIPELINE-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [8 x i32] poison, [2 x i32] [[TMP27]]), !continuation.registercount [[META1]]
+; LOWER-RAYTRACING-PIPELINE-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [8 x i32] poison, [2 x i32] [[TMP27]]), !continuation.registercount [[META1]]
 ; LOWER-RAYTRACING-PIPELINE-NEXT:    unreachable
 ;
 ;
@@ -136,14 +139,17 @@ attributes #1 = { nounwind willreturn memory(argmem: readwrite, inaccessiblemem:
 ; SROA-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_MASK16:%.*]] = and i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_INSERT22]], -256
 ; SROA-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_INSERT17:%.*]] = or i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_MASK16]], [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_EXT15]]
 ; SROA-NEXT:    [[DOTFCA_1_INSERT8:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT5]], i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_INSERT17]], 1
-; SROA-NEXT:    [[TMP1:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa8i32a2i32s(i32 2, i32 4, i32 5, [9 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT8]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]]
+; SROA-NEXT:    [[TMP1:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa8i32a2i32s(i32 2, i32 4, i32 5, [9 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT8]]), !continuation.returnedRegistercount [[META1]], !continuation.registercount [[META1]]
 ; SROA-NEXT:    [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } [[TMP1]], 2
 ; SROA-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x i32] [[TMP2]], 0
 ; SROA-NEXT:    [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x i32] [[TMP2]], 1
 ; SROA-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_EXTRACT_TRUNC18:%.*]] = trunc i32 [[DOTFCA_1_EXTRACT]] to i8
 ; SROA-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_EXTRACT_SHIFT23:%.*]] = lshr i32 [[DOTFCA_1_EXTRACT]], 8
 ; SROA-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_EXTRACT_TRUNC24:%.*]] = trunc i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_EXTRACT_SHIFT23]] to i24
-; SROA-NEXT:    store i1 poison, ptr [[DOTSROA_5]], align 4
+; SROA-NEXT:    [[TMP4:%.*]] = freeze [[STRUCT_MYPARAMS:%.*]] poison
+; SROA-NEXT:    [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_MYPARAMS]] [[TMP4]], 0
+; SROA-NEXT:    [[DOTFCA_1_EXTRACT1:%.*]] = extractvalue [[STRUCT_MYPARAMS]] [[TMP4]], 1
+; SROA-NEXT:    store i1 [[DOTFCA_1_EXTRACT1]], ptr [[DOTSROA_5]], align 4
 ; SROA-NEXT:    store i8 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_EXTRACT_TRUNC18]], ptr [[DOTSROA_5]], align 4
 ; SROA-NEXT:    [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [2 x i32] } [[TMP1]], 0
 ; SROA-NEXT:    [[DOTFCA_0_EXTRACT27:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP3]], 0
@@ -158,6 +164,6 @@ attributes #1 = { nounwind willreturn memory(argmem: readwrite, inaccessiblemem:
 ; SROA-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_MASK:%.*]] = and i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_16_4_INSERT_INSERT]], -256
 ; SROA-NEXT:    [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_INSERT:%.*]] = or i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_MASK]], [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_EXT]]
 ; SROA-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_SERIALIZATION_ALLOCA_SROA_8_4_INSERT_INSERT]], 1
-; SROA-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT26]], [8 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META1]]
+; SROA-NEXT:    call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT26]], [8 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META1]]
 ; SROA-NEXT:    unreachable
 ;
diff --git a/llvmraytracing/test/lgccps/cleanup-store-loads.ll b/llvmraytracing/test/lgccps/cleanup-store-loads.ll
index 684c24a284..ade1a8367f 100644
--- a/llvmraytracing/test/lgccps/cleanup-store-loads.ll
+++ b/llvmraytracing/test/lgccps/cleanup-store-loads.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
 ; RUN: opt --verify-each -S -o - -passes='cgscc(inline),cleanup-continuations' %s | FileCheck --check-prefixes=CHECK %s
 
+!lgc.cps.module = !{}
+
 declare !lgc.cps !0 void @callee({}, i32, float)
 
 declare i64 @getVal64()
@@ -113,7 +115,7 @@ entry:
 
 bb1:                                              ; preds = %entry
   %2 = inttoptr i32 %cr to ptr
-  %3 = call ptr %2(i32 %cr, i32 2, float %arg)
+  %3 = call ptr %2(i32 %cr, i32 2, float %arg), !continuation.returnedRegistercount !{i32 0}
   %4 = insertvalue { ptr, ptr } undef, ptr @test.resume.0, 0
   %5 = insertvalue { ptr, ptr } %4, ptr %3, 1
   ret { ptr, ptr } %5
@@ -157,7 +159,7 @@ bb2:                                              ; preds = %entry
   ; Multiple loads can be optimized away
   call void @loadAtOffsetI32(ptr %data, i32 48)
 
-  call void (...) @lgc.cps.jump(i32 %rcr.reload, i32 2, {} poison, i32 poison, float %returnvalue)
+  call void (...) @lgc.cps.jump(i32 %rcr.reload, i32 2, {} poison, i32 poison, i32 poison, float %returnvalue)
   unreachable
 }
 
@@ -170,7 +172,7 @@ entryresume.0:
   %rcr.reload.addr = getelementptr inbounds %test.Frame, ptr %2, i32 0, i32 0
   %rcr.reload = load i32, ptr %rcr.reload.addr, align 4
   %returnvalue = fmul float %3, %arg.reload
-  call void (...) @lgc.cps.jump(i32 %rcr.reload, i32 2, {} poison, i32 poison, float %returnvalue)
+  call void (...) @lgc.cps.jump(i32 %rcr.reload, i32 2, {} poison, i32 poison, i32 poison, float %returnvalue)
   unreachable
 }
 
@@ -259,7 +261,7 @@ attributes #4 = { alwaysinline }
 ;
 ;
 ; CHECK-LABEL: define void @test(
-; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] {
+; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] !continuation.stacksize [[META2:![0-9]+]] !continuation.state [[META2]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 408)
 ; CHECK-NEXT:    [[ARG_SPILL_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1
@@ -315,7 +317,7 @@ attributes #4 = { alwaysinline }
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @test.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 [[TMP1]], float [[ARG]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 poison, i32 [[TMP1]], float [[ARG]]), !continuation.returnedRegistercount [[META3:![0-9]+]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       bb2:
 ; CHECK-NEXT:    [[T0_BB2:%.*]] = phi float [ [[T0]], [[ENTRY:%.*]] ]
@@ -346,12 +348,12 @@ attributes #4 = { alwaysinline }
 ; CHECK-NEXT:    call void @useVal32(i32 [[VAL_I23]])
 ; CHECK-NEXT:    call void @useVal32(i32 [[VAL_I23]])
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 408)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR]], i32 2, {} poison, i32 poison, float [[RETURNVALUE]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR]], i32 2, {} poison, i32 poison, i32 poison, float [[RETURNVALUE]])
 ; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define dso_local void @test.resume.0(
-; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] {
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] !continuation.registercount [[META3]] {
 ; CHECK-NEXT:  entryresume.0:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 408)
 ; CHECK-NEXT:    [[ARG_RELOAD_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 1
@@ -360,6 +362,6 @@ attributes #4 = { alwaysinline }
 ; CHECK-NEXT:    [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[RCR_RELOAD_ADDR]], align 4
 ; CHECK-NEXT:    [[RETURNVALUE:%.*]] = fmul float [[TMP3]], [[ARG_RELOAD]]
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 408)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, float [[RETURNVALUE]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, i32 poison, float [[RETURNVALUE]])
 ; CHECK-NEXT:    unreachable
 ;
diff --git a/llvmraytracing/test/lgccps/entry-point-with-cps.ll b/llvmraytracing/test/lgccps/entry-point-with-cps.ll
index c974db97f7..9be2242ee7 100644
--- a/llvmraytracing/test/lgccps/entry-point-with-cps.ll
+++ b/llvmraytracing/test/lgccps/entry-point-with-cps.ll
@@ -6,6 +6,8 @@
 ; Details of the output are likely to differ from the final production pass,
 ; especially instruction order and value names.
 
+!lgc.cps.module = !{}
+
 declare void @lgc.cps.complete()
 
 define void @_cont_KernelEntry() #0 !lgc.rt.shaderstage !{i32 7} {
@@ -23,7 +25,7 @@ define spir_func void @raygen({} %state, i32 %rcr) !lgc.shaderstage !{i32 7} !lg
 
   %cr.0 = ptrtoint ptr %fn to i32
   %cr.1 = or i32 %cr.0, 2
-  %r = call [2 x i32] (...) @lgc.cps.await__a2i32(i32 %cr.1, i32 4, i32 %x, ptr addrspace(1) %dst)
+  %r = call [2 x i32] (...) @lgc.cps.await__a2i32(i32 %cr.1, i32 4, i32 %x, ptr addrspace(1) %dst), !continuation.returnedRegistercount !{i32 0}
 
   store [2 x i32] %r, ptr addrspace(1) %dst
 
@@ -38,13 +40,13 @@ define spir_func void @chs({} %state, i32 %rcr, i32 %x) !lgc.shaderstage !{i32 7
 
   %cr.0 = ptrtoint ptr %fn to i32
   %cr.1 = or i32 %cr.0, 1
-  %y = call i32 (...) @lgc.cps.await__i32(i32 %cr.1, i32 2, i32 %x)
+  %y = call i32 (...) @lgc.cps.await__i32(i32 %cr.1, i32 2, i32 %x), !continuation.returnedRegistercount !{i32 0}
 
-  call void (...) @lgc.cps.jump(i32 %rcr, i32 5, i32 %y)
+  call void (...) @lgc.cps.jump(i32 %rcr, i32 5, i32 %y, i32 poison, i32 poison)
   unreachable
 }
 
-define dllexport void @lgc.shader.CS.main() !lgc.shaderstage !{i32 7} {
+define dllexport void @lgc.shader.CS.main() !lgc.shaderstage !{i32 7} !lgc.rt.shaderstage !{i32 7} {
 entry:
   %id = call <3 x i32> @lgc.shader.input.LocalInvocationId(i32 0)
   %id0 = extractelement <3 x i32> %id, i32 0
@@ -56,7 +58,7 @@ main:
   %fn = load ptr, ptr addrspace(4) %pushconst
 
   %cr.0 = ptrtoint ptr %fn to i32
-  call void (...) @lgc.cps.await__isVoid(i32 %cr.0, i32 1, i32 5)
+  call void (...) @lgc.cps.jump(i32 %cr.0, i32 1, i32 5)
 
   br label %exit
 
@@ -72,11 +74,14 @@ declare void @lgc.cps.await__isVoid(...)
 declare i32 @lgc.cps.await__i32(...)
 declare [2 x i32] @lgc.cps.await__a2i32(...)
 declare void @lgc.cps.jump(...)
+
 ; CHECK-LABEL: define void @_cont_KernelEntry(
+; CHECK-SAME: ) !lgc.rt.shaderstage [[META0:![0-9]+]] {
 ; CHECK-NEXT:    ret void
-
+;
+;
 ; CHECK-LABEL: define spir_func void @raygen(
-; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]]) !lgc.shaderstage [[META0:![0-9]+]] !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] {
+; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]]) !lgc.shaderstage [[META0]] !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.state [[META1]] {
 ; CHECK-NEXT:  AllocaSpillBB:
 ; CHECK-NEXT:    [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 0)
 ; CHECK-NEXT:    [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8
@@ -88,12 +93,12 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[CR_1:%.*]] = or i32 [[CR_0]], 2
 ; CHECK-NEXT:    [[TMP0:%.*]] = inttoptr i32 [[CR_1]] to ptr
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @raygen.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR_1]], i32 4, {} poison, i32 [[TMP1]], i32 [[X]], ptr addrspace(1) [[DST]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR_1]], i32 4, {} poison, i32 poison, i32 [[TMP1]], i32 [[X]], ptr addrspace(1) [[DST]]), !continuation.returnedRegistercount [[META1]]
 ; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define dso_local void @raygen.resume.0(
-; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], [2 x i32] [[TMP3:%.*]]) !lgc.shaderstage [[META0]] !lgc.cps [[META1]] !continuation [[META2]] {
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], [2 x i32] [[TMP3:%.*]]) !lgc.shaderstage [[META0]] !lgc.cps [[META1]] !continuation [[META2]] !continuation.registercount [[META1]] {
 ; CHECK-NEXT:  entryresume.0:
 ; CHECK-NEXT:    [[PUSHCONST3:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 0)
 ; CHECK-NEXT:    [[P162:%.*]] = getelementptr i8, ptr addrspace(4) [[PUSHCONST3]], i32 16
@@ -103,7 +108,7 @@ declare void @lgc.cps.jump(...)
 ;
 ;
 ; CHECK-LABEL: define spir_func void @chs(
-; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], i32 [[X:%.*]]) !lgc.shaderstage [[META0]] !lgc.cps [[META3:![0-9]+]] !continuation [[META4:![0-9]+]] {
+; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], i32 [[X:%.*]]) !lgc.shaderstage [[META0]] !lgc.cps [[META3:![0-9]+]] !continuation [[META4:![0-9]+]] !continuation.stacksize [[META5:![0-9]+]] !continuation.state [[META5]] {
 ; CHECK-NEXT:  AllocaSpillBB:
 ; CHECK-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8)
 ; CHECK-NEXT:    [[RCR_SPILL_ADDR:%.*]] = getelementptr inbounds [[CHS_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0
@@ -114,23 +119,23 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[CR_1:%.*]] = or i32 [[CR_0]], 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = inttoptr i32 [[CR_1]] to ptr
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @chs.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR_1]], i32 2, {} poison, i32 [[TMP1]], i32 [[X]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR_1]], i32 2, {} poison, i32 poison, i32 [[TMP1]], i32 [[X]]), !continuation.returnedRegistercount [[META1]]
 ; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define dso_local void @chs.resume.0(
-; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]]) !lgc.shaderstage [[META0]] !lgc.cps [[META3]] !continuation [[META4]] {
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]]) !lgc.shaderstage [[META0]] !lgc.cps [[META3]] !continuation [[META4]] !continuation.registercount [[META1]] {
 ; CHECK-NEXT:  entryresume.0:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8)
 ; CHECK-NEXT:    [[RCR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[CHS_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0
 ; CHECK-NEXT:    [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[RCR_RELOAD_ADDR]], align 4
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 8)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 5, i32 [[TMP3]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 5, i32 [[TMP3]], i32 poison, i32 poison)
 ; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define dllexport void @lgc.shader.CS.main(
-; CHECK-SAME: ) !lgc.shaderstage [[META0]] !continuation [[META5:![0-9]+]] {
+; CHECK-SAME: ) !lgc.rt.shaderstage [[META0]] !lgc.shaderstage [[META0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ID:%.*]] = call <3 x i32> @lgc.shader.input.LocalInvocationId(i32 0)
 ; CHECK-NEXT:    [[ID0:%.*]] = extractelement <3 x i32> [[ID]], i32 0
@@ -140,15 +145,20 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 32)
 ; CHECK-NEXT:    [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8
 ; CHECK-NEXT:    [[CR_0:%.*]] = ptrtoint ptr [[FN]] to i32
-; CHECK-NEXT:    [[TMP0:%.*]] = inttoptr i32 [[CR_0]] to ptr
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR_0]], i32 1, {} poison, i32 poison, i32 5)
-; CHECK-NEXT:    unreachable
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR_0]], i32 1, i32 5)
+; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ;
+; LOWER-AWAIT-LABEL: define void @_cont_KernelEntry(
+; LOWER-AWAIT-SAME: ) !lgc.rt.shaderstage [[META0:![0-9]+]] {
+; LOWER-AWAIT-NEXT:    call void @lgc.cps.complete()
+; LOWER-AWAIT-NEXT:    unreachable
+;
+;
 ; LOWER-AWAIT-LABEL: define spir_func { ptr, ptr } @raygen(
-; LOWER-AWAIT-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], ptr [[TMP0:%.*]]) !lgc.shaderstage [[META0:![0-9]+]] !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] {
+; LOWER-AWAIT-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], ptr [[TMP0:%.*]]) !lgc.shaderstage [[META0]] !lgc.cps [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] {
 ; LOWER-AWAIT-NEXT:    [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.raygen, ptr @continuation.malloc, ptr @continuation.free)
 ; LOWER-AWAIT-NEXT:    [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null)
 ; LOWER-AWAIT-NEXT:    [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 0)
@@ -160,7 +170,7 @@ declare void @lgc.cps.jump(...)
 ; LOWER-AWAIT-NEXT:    [[CR_0:%.*]] = ptrtoint ptr [[FN]] to i32
 ; LOWER-AWAIT-NEXT:    [[CR_1:%.*]] = or i32 [[CR_0]], 2
 ; LOWER-AWAIT-NEXT:    [[TMP4:%.*]] = inttoptr i32 [[CR_1]] to ptr
-; LOWER-AWAIT-NEXT:    [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CR_1]], i32 4, i32 [[X]], ptr addrspace(1) [[DST]])
+; LOWER-AWAIT-NEXT:    [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CR_1]], i32 4, i32 [[X]], ptr addrspace(1) [[DST]]), !continuation.returnedRegistercount [[META1]]
 ; LOWER-AWAIT-NEXT:    [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]])
 ; LOWER-AWAIT-NEXT:    [[TMP7:%.*]] = call [2 x i32] @lgc.ilcps.getReturnValue__a2i32()
 ; LOWER-AWAIT-NEXT:    store [2 x i32] [[TMP7]], ptr addrspace(1) [[DST]], align 4
@@ -177,18 +187,16 @@ declare void @lgc.cps.jump(...)
 ; LOWER-AWAIT-NEXT:    [[CR_0:%.*]] = ptrtoint ptr [[FN]] to i32
 ; LOWER-AWAIT-NEXT:    [[CR_1:%.*]] = or i32 [[CR_0]], 1
 ; LOWER-AWAIT-NEXT:    [[TMP4:%.*]] = inttoptr i32 [[CR_1]] to ptr
-; LOWER-AWAIT-NEXT:    [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CR_1]], i32 2, i32 [[X]])
+; LOWER-AWAIT-NEXT:    [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CR_1]], i32 2, i32 [[X]]), !continuation.returnedRegistercount [[META1]]
 ; LOWER-AWAIT-NEXT:    [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]])
 ; LOWER-AWAIT-NEXT:    [[TMP7:%.*]] = call i32 @lgc.ilcps.getReturnValue__i32()
-; LOWER-AWAIT-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR]], i32 5, i32 [[TMP7]])
+; LOWER-AWAIT-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR]], i32 5, i32 [[TMP7]], i32 poison, i32 poison)
 ; LOWER-AWAIT-NEXT:    unreachable
 ;
 ;
-; LOWER-AWAIT-LABEL: define dllexport { ptr, ptr } @lgc.shader.CS.main(
-; LOWER-AWAIT-SAME: ptr [[TMP0:%.*]]) !lgc.shaderstage [[META0]] !continuation [[META5:![0-9]+]] {
+; LOWER-AWAIT-LABEL: define dllexport void @lgc.shader.CS.main(
+; LOWER-AWAIT-SAME: ) !lgc.rt.shaderstage [[META0]] !lgc.shaderstage [[META0]] {
 ; LOWER-AWAIT-NEXT:  entry:
-; LOWER-AWAIT-NEXT:    [[TMP1:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.lgc.shader.CS.main, ptr @continuation.malloc, ptr @continuation.free)
-; LOWER-AWAIT-NEXT:    [[TMP2:%.*]] = call ptr @llvm.coro.begin(token [[TMP1]], ptr null)
 ; LOWER-AWAIT-NEXT:    [[ID:%.*]] = call <3 x i32> @lgc.shader.input.LocalInvocationId(i32 0)
 ; LOWER-AWAIT-NEXT:    [[ID0:%.*]] = extractelement <3 x i32> [[ID]], i32 0
 ; LOWER-AWAIT-NEXT:    [[LIVE:%.*]] = icmp ult i32 [[ID0]], 29
@@ -197,9 +205,7 @@ declare void @lgc.cps.jump(...)
 ; LOWER-AWAIT-NEXT:    [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 32)
 ; LOWER-AWAIT-NEXT:    [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8
 ; LOWER-AWAIT-NEXT:    [[CR_0:%.*]] = ptrtoint ptr [[FN]] to i32
-; LOWER-AWAIT-NEXT:    [[TMP3:%.*]] = inttoptr i32 [[CR_0]] to ptr
-; LOWER-AWAIT-NEXT:    [[TMP4:%.*]] = call ptr [[TMP3]](i32 [[CR_0]], i32 1, i32 5)
-; LOWER-AWAIT-NEXT:    [[TMP5:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP4]])
+; LOWER-AWAIT-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR_0]], i32 1, i32 5)
 ; LOWER-AWAIT-NEXT:    br label [[EXIT]]
 ; LOWER-AWAIT:       exit:
 ; LOWER-AWAIT-NEXT:    call void @lgc.cps.complete()
diff --git a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-get-i32.ll b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-get-i32.ll
index 0ce825d1c1..9cd5333716 100644
--- a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-get-i32.ll
+++ b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-get-i32.ll
@@ -7,6 +7,7 @@
 ; Need _cont_ReportHit to get system data type
 declare !pointeetys !6 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind)
 
+declare !pointeetys !10 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 declare !pointeetys !10 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 
 declare i64 @_AmdGetCurrentFuncAddr()
@@ -26,13 +27,13 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @
 ; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[TMP0]], align 4
 ; CHECK-NEXT:    store i32 [[VAL]], ptr @debug_global, align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 5, [42 x i32] poison, [30 x i32] [[TMP2]]), !continuation.registercount [[META0:![0-9]+]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 poison, i32 5, [42 x i32] poison, [30 x i32] [[TMP2]]), !continuation.registercount [[META0:![0-9]+]]
 ; CHECK-NEXT:    unreachable
 ;
 .entry:
   %val = call i32 @_AmdContPayloadRegistersGetI32(i32 2)
   store i32 %val, i32* @debug_global, align 4
-  call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 5)
+  call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 poison, i32 5)
   unreachable
 }
 
diff --git a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll
index 279620cd4c..b813925255 100644
--- a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll
+++ b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll
@@ -7,6 +7,7 @@
 ; Need _cont_ReportHit to get system data type
 declare !pointeetys !6 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind)
 
+declare !pointeetys !10 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 declare !pointeetys !10 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 
 declare i64 @_AmdGetCurrentFuncAddr()
@@ -24,13 +25,13 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @
 ; CHECK-NEXT:    store { { i32 } } [[SYSTEM_DATA]], ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 4
 ; CHECK-NEXT:    store i32 11, ptr @debug_global, align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = load [11 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 5, [42 x i32] poison, [11 x i32] [[TMP0]]), !continuation.registercount [[META1:![0-9]+]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 poison, i32 5, [42 x i32] poison, [11 x i32] [[TMP0]]), !continuation.registercount [[META1:![0-9]+]]
 ; CHECK-NEXT:    unreachable
 ;
 .entry:
   %val = call i32 @_AmdContPayloadRegistersI32Count()
   store i32 %val, i32* @debug_global, align 4
-  call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 5)
+  call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 poison, i32 5)
   unreachable
 }
 
diff --git a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-set-i32.ll b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-set-i32.ll
index 8e8fbf3034..57d1b59bba 100644
--- a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-set-i32.ll
+++ b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-set-i32.ll
@@ -7,6 +7,7 @@
 ; Need _cont_ReportHit to get system data type
 declare !pointeetys !6 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind)
 
+declare !pointeetys !10 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 declare !pointeetys !10 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 
 declare i64 @_AmdGetCurrentFuncAddr()
@@ -23,12 +24,12 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 3
 ; CHECK-NEXT:    store i32 42, ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 5, [42 x i32] poison, [30 x i32] [[TMP1]]), !continuation.registercount [[META0:![0-9]+]]
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 poison, i32 5, [42 x i32] poison, [30 x i32] [[TMP1]]), !continuation.registercount [[META0:![0-9]+]]
 ; CHECK-NEXT:    unreachable
 ;
 .entry:
   call void @_AmdContPayloadRegistersSetI32(i32 3, i32 42)
-  call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 5)
+  call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 poison, i32 5)
   unreachable
 }
 
diff --git a/llvmraytracing/test/lgccps/lower-traversal.ll b/llvmraytracing/test/lgccps/lower-traversal.ll
index cef8cab9d4..90c214fed8 100644
--- a/llvmraytracing/test/lgccps/lower-traversal.ll
+++ b/llvmraytracing/test/lgccps/lower-traversal.ll
@@ -9,6 +9,7 @@
 ; Need _cont_ReportHit to get system data type
 declare  !pointeetys !6 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind)
 
+declare !pointeetys !10 <3 x i32> @_cont_DispatchRaysIndex3(%struct.DispatchSystemData*)
 declare !pointeetys !10 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*)
 
 declare i64 @_AmdGetCurrentFuncAddr()
@@ -123,7 +124,7 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @
 ; CHECK-ATTRSIZE-16-NEXT:    [[DOTFCA_2_7_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT]], i32 [[TMP42]], 2, 7
 ; CHECK-ATTRSIZE-16-NEXT:    [[DOTFCA_2_8_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT]], i64 [[TMP44]], 2, 8
 ; CHECK-ATTRSIZE-16-NEXT:    [[TMP109:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-ATTRSIZE-16-NEXT:    call void (...) @lgc.cps.jump(i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 [[DOTSROA_0130_0_EXTRACT_TRUNC]], i32 [[DOT0]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT]], [7 x i32] poison, [8 x i32] [[TMP109]]), !continuation.registercount [[META0:![0-9]+]]
+; CHECK-ATTRSIZE-16-NEXT:    call void (...) @lgc.cps.jump(i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 [[DOTSROA_0130_0_EXTRACT_TRUNC]], i32 poison, i32 [[DOT0]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT]], [7 x i32] poison, [8 x i32] [[TMP109]]), !continuation.registercount [[META0:![0-9]+]]
 ; CHECK-ATTRSIZE-16-NEXT:    unreachable
 ; CHECK-ATTRSIZE-16:       68:
 ; CHECK-ATTRSIZE-16-NEXT:    [[TMP68:%.*]] = shl i32 [[DOTFR]], 3
@@ -200,7 +201,7 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @
 ; CHECK-ATTRSIZE-16-NEXT:    [[DOTFCA_2_7_INSERT342:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT341]], i32 [[TMP42]], 2, 7
 ; CHECK-ATTRSIZE-16-NEXT:    [[DOTFCA_2_8_INSERT343:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT342]], i64 [[TMP44]], 2, 8
 ; CHECK-ATTRSIZE-16-NEXT:    [[TMP108:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-ATTRSIZE-16-NEXT:    call void (...) @lgc.cps.jump(i32 [[DOTSROA_0150_0_VEC_EXTRACT]], i32 -1, {} poison, i32 [[DOTSROA_0320_0_EXTRACT_TRUNC]], i32 [[TMP83]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT343]], [7 x i32] poison, [8 x i32] [[TMP108]]), !continuation.registercount [[META0]]
+; CHECK-ATTRSIZE-16-NEXT:    call void (...) @lgc.cps.jump(i32 [[DOTSROA_0150_0_VEC_EXTRACT]], i32 -1, {} poison, i32 poison, i32 [[DOTSROA_0320_0_EXTRACT_TRUNC]], i32 [[TMP83]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT343]], [7 x i32] poison, [8 x i32] [[TMP108]]), !continuation.registercount [[META0]]
 ; CHECK-ATTRSIZE-16-NEXT:    unreachable
 ; CHECK-ATTRSIZE-16:       109:
 ; CHECK-ATTRSIZE-16-NEXT:    [[DOTSROA_7_0:%.*]] = phi i32 [ [[TMP4]], [[DOTEXIT2]] ], [ [[TMP83]], [[DOTEXIT5]] ]
@@ -208,7 +209,7 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @
 ; CHECK-ATTRSIZE-16-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } poison, <3 x i32> [[TMP2]], 0
 ; CHECK-ATTRSIZE-16-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } [[DOTFCA_0_INSERT]], i32 [[DOTSROA_7_0]], 1
 ; CHECK-ATTRSIZE-16-NEXT:    [[TMP110:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-ATTRSIZE-16-NEXT:    call void (...) @lgc.cps.jump(i32 [[DOTSROA_0373_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 poison, i32 [[DOTSROA_7_0]], { <3 x i32>, i32 } [[DOTFCA_1_INSERT]], [34 x i32] poison, [8 x i32] [[TMP110]]), !continuation.registercount [[META0]]
+; CHECK-ATTRSIZE-16-NEXT:    call void (...) @lgc.cps.jump(i32 [[DOTSROA_0373_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 poison, i32 poison, i32 [[DOTSROA_7_0]], { <3 x i32>, i32 } [[DOTFCA_1_INSERT]], [34 x i32] poison, [8 x i32] [[TMP110]]), !continuation.registercount [[META0]]
 ; CHECK-ATTRSIZE-16-NEXT:    unreachable
 ;
 ; CHECK-ATTRSIZE-8-LABEL: define dso_local spir_func void @_cont_Traversal(
@@ -320,7 +321,7 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @
 ; CHECK-ATTRSIZE-8-NEXT:    [[DOTFCA_2_7_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT]], i32 [[TMP42]], 2, 7
 ; CHECK-ATTRSIZE-8-NEXT:    [[DOTFCA_2_8_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT]], i64 [[TMP44]], 2, 8
 ; CHECK-ATTRSIZE-8-NEXT:    [[TMP109:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-ATTRSIZE-8-NEXT:    call void (...) @lgc.cps.jump(i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 [[DOTSROA_0130_0_EXTRACT_TRUNC]], i32 [[DOT0]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT]], [5 x i32] poison, [8 x i32] [[TMP109]]), !continuation.registercount [[META0:![0-9]+]]
+; CHECK-ATTRSIZE-8-NEXT:    call void (...) @lgc.cps.jump(i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 [[DOTSROA_0130_0_EXTRACT_TRUNC]], i32 poison, i32 [[DOT0]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT]], [5 x i32] poison, [8 x i32] [[TMP109]]), !continuation.registercount [[META0:![0-9]+]]
 ; CHECK-ATTRSIZE-8-NEXT:    unreachable
 ; CHECK-ATTRSIZE-8:       68:
 ; CHECK-ATTRSIZE-8-NEXT:    [[TMP68:%.*]] = shl i32 [[DOTFR]], 3
@@ -397,7 +398,7 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @
 ; CHECK-ATTRSIZE-8-NEXT:    [[DOTFCA_2_7_INSERT342:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT341]], i32 [[TMP42]], 2, 7
 ; CHECK-ATTRSIZE-8-NEXT:    [[DOTFCA_2_8_INSERT343:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT342]], i64 [[TMP44]], 2, 8
 ; CHECK-ATTRSIZE-8-NEXT:    [[TMP108:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-ATTRSIZE-8-NEXT:    call void (...) @lgc.cps.jump(i32 [[DOTSROA_0150_0_VEC_EXTRACT]], i32 -1, {} poison, i32 [[DOTSROA_0320_0_EXTRACT_TRUNC]], i32 [[TMP83]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT343]], [5 x i32] poison, [8 x i32] [[TMP108]]), !continuation.registercount [[META0]]
+; CHECK-ATTRSIZE-8-NEXT:    call void (...) @lgc.cps.jump(i32 [[DOTSROA_0150_0_VEC_EXTRACT]], i32 -1, {} poison, i32 poison, i32 [[DOTSROA_0320_0_EXTRACT_TRUNC]], i32 [[TMP83]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT343]], [5 x i32] poison, [8 x i32] [[TMP108]]), !continuation.registercount [[META0]]
 ; CHECK-ATTRSIZE-8-NEXT:    unreachable
 ; CHECK-ATTRSIZE-8:       109:
 ; CHECK-ATTRSIZE-8-NEXT:    [[DOTSROA_7_0:%.*]] = phi i32 [ [[TMP4]], [[DOTEXIT2]] ], [ [[TMP83]], [[DOTEXIT5]] ]
@@ -405,7 +406,7 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @
 ; CHECK-ATTRSIZE-8-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } poison, <3 x i32> [[TMP2]], 0
 ; CHECK-ATTRSIZE-8-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } [[DOTFCA_0_INSERT]], i32 [[DOTSROA_7_0]], 1
 ; CHECK-ATTRSIZE-8-NEXT:    [[TMP110:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4
-; CHECK-ATTRSIZE-8-NEXT:    call void (...) @lgc.cps.jump(i32 [[DOTSROA_0373_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 poison, i32 [[DOTSROA_7_0]], { <3 x i32>, i32 } [[DOTFCA_1_INSERT]], [32 x i32] poison, [8 x i32] [[TMP110]]), !continuation.registercount [[META0]]
+; CHECK-ATTRSIZE-8-NEXT:    call void (...) @lgc.cps.jump(i32 [[DOTSROA_0373_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 poison, i32 poison, i32 [[DOTSROA_7_0]], { <3 x i32>, i32 } [[DOTFCA_1_INSERT]], [32 x i32] poison, [8 x i32] [[TMP110]]), !continuation.registercount [[META0]]
 ; CHECK-ATTRSIZE-8-NEXT:    unreachable
 ;
 .entry:
@@ -514,7 +515,7 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @
   %.fca.2.6.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.5.insert, i32 %41, 2, 6
   %.fca.2.7.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.6.insert, i32 %43, 2, 7
   %.fca.2.8.insert = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.7.insert, i64 %45, 2, 8
-  call void (...) @lgc.cps.jump(i32 %.sroa.0128.0.extract.trunc, i32 -1, {} poison, i32 %.sroa.0130.0.extract.trunc, i32 %.0, { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.8.insert)
+  call void (...) @lgc.cps.jump(i32 %.sroa.0128.0.extract.trunc, i32 -1, {} poison, i32 %.sroa.0130.0.extract.trunc, i32 poison, i32 %.0, { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.8.insert)
   unreachable
 
 68:                                               ; preds = %.entry
@@ -593,7 +594,7 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @
   %.fca.2.6.insert341 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.5.insert340, i32 %41, 2, 6
   %.fca.2.7.insert342 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.6.insert341, i32 %43, 2, 7
   %.fca.2.8.insert343 = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.7.insert342, i64 %45, 2, 8
-  call void (...) @lgc.cps.jump(i32 %.sroa.0150.0.vec.extract, i32 -1, {} poison, i32 %.sroa.0320.0.extract.trunc, i32 %84, { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.8.insert343)
+  call void (...) @lgc.cps.jump(i32 %.sroa.0150.0.vec.extract, i32 -1, {} poison, i32 poison, i32 %.sroa.0320.0.extract.trunc, i32 %84, { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } %.fca.2.8.insert343)
   unreachable
 
 106:                                              ; preds = %.exit5, %.exit2
@@ -601,7 +602,7 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @
   %.sroa.0373.0.extract.trunc = trunc i64 %45 to i32
   %.fca.0.insert = insertvalue { <3 x i32>, i32 } poison, <3 x i32> %3, 0
   %.fca.1.insert = insertvalue { <3 x i32>, i32 } %.fca.0.insert, i32 %.sroa.7.0, 1
-  call void (...) @lgc.cps.jump(i32 %.sroa.0373.0.extract.trunc, i32 -1, {} poison, i32 poison, i32 %.sroa.7.0, { <3 x i32>, i32 } %.fca.1.insert)
+  call void (...) @lgc.cps.jump(i32 %.sroa.0373.0.extract.trunc, i32 -1, {} poison, i32 poison, i32 poison, i32 %.sroa.7.0, { <3 x i32>, i32 } %.fca.1.insert)
   unreachable
 }
 
diff --git a/llvmraytracing/test/lgccps/multiple-await.ll b/llvmraytracing/test/lgccps/multiple-await.ll
index 0539843a52..266559bced 100644
--- a/llvmraytracing/test/lgccps/multiple-await.ll
+++ b/llvmraytracing/test/lgccps/multiple-await.ll
@@ -1,18 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
 ; RUN: opt --verify-each -S -o - -passes='lower-await,coro-early,lgc-coro-split,coro-cleanup,cleanup-continuations' %s | FileCheck --check-prefixes=CHECK %s
 
+!lgc.cps.module = !{}
+
 declare !lgc.cps !0 void @callee({}, i32, float)
 declare !lgc.cps !0 void @callee2({}, i32, float)
 
 define void @test({} %state, i32 %rcr, float %arg, float %arg2) !lgc.cps !0 {
   %t0 = fadd float %arg, 1.0
   %cr = call i32 @lgc.cps.as.continuation.reference(ptr @callee)
-  %t1 = call float (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %t0)
+  %t1 = call float (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %t0), !continuation.returnedRegistercount !{i32 0}
   %t2 = fmul float %t1, %arg
   %cr2 = call i32 @lgc.cps.as.continuation.reference(ptr @callee2)
-  %t3 = call float (...) @lgc.cps.await__f32(i32 %cr2, i32 2, float %t2)
+  %t3 = call float (...) @lgc.cps.await__f32(i32 %cr2, i32 2, float %t2), !continuation.returnedRegistercount !{i32 0}
   %returnvalue = fadd float %t3, %arg2
-  call void (...) @lgc.cps.jump(i32 %rcr, i32 2, {} poison, i32 poison, float %returnvalue)
+  call void (...) @lgc.cps.jump(i32 %rcr, i32 2, {} poison, i32 poison, i32 poison, float %returnvalue)
   unreachable
 }
 
@@ -22,7 +24,7 @@ declare i32 @lgc.cps.as.continuation.reference(...) memory(none)
 declare float @lgc.cps.await__f32(...)
 declare void @lgc.cps.jump(...)
 ; CHECK-LABEL: define void @test(
-; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], float [[ARG2:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] {
+; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], float [[ARG2:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] !continuation.stacksize [[META2:![0-9]+]] !continuation.state [[META2]] {
 ; CHECK-NEXT:  AllocaSpillBB:
 ; CHECK-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 12)
 ; CHECK-NEXT:    [[ARG2_SPILL_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 2
@@ -35,12 +37,12 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee)
 ; CHECK-NEXT:    [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @test.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 [[TMP1]], float [[T0]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 poison, i32 [[TMP1]], float [[T0]]), !continuation.returnedRegistercount [[META3:![0-9]+]]
 ; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define dso_local void @test.resume.0(
-; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] {
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] !continuation.registercount [[META3]] {
 ; CHECK-NEXT:  entryresume.0:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 12)
 ; CHECK-NEXT:    [[ARG_RELOAD_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 1
@@ -49,12 +51,12 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[CR2:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee2)
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i32 [[CR2]] to ptr
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @test.resume.1)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR2]], i32 2, {} poison, i32 [[TMP6]], float [[T2]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR2]], i32 2, {} poison, i32 poison, i32 [[TMP6]], float [[T2]]), !continuation.returnedRegistercount [[META3]]
 ; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define dso_local void @test.resume.1(
-; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] {
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] !continuation.registercount [[META3]] {
 ; CHECK-NEXT:  entryresume.1:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 12)
 ; CHECK-NEXT:    [[ARG2_RELOAD_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 2
@@ -63,6 +65,6 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[RCR_RELOAD_ADDR]], align 4
 ; CHECK-NEXT:    [[RETURNVALUE:%.*]] = fadd float [[TMP3]], [[ARG2_RELOAD]]
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 12)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, float [[RETURNVALUE]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, i32 poison, float [[RETURNVALUE]])
 ; CHECK-NEXT:    unreachable
 ;
diff --git a/llvmraytracing/test/lgccps/simple-await-more-state.ll b/llvmraytracing/test/lgccps/simple-await-more-state.ll
index 61f3280346..095da2a378 100644
--- a/llvmraytracing/test/lgccps/simple-await-more-state.ll
+++ b/llvmraytracing/test/lgccps/simple-await-more-state.ll
@@ -1,15 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
 ; RUN: opt --verify-each -S -o - -passes='lower-await,coro-early,lgc-coro-split,coro-cleanup,cleanup-continuations' %s | FileCheck --check-prefixes=CHECK %s
 
+!lgc.cps.module = !{}
+
 declare !lgc.cps !0 void @callee({}, i32, float)
 
 define void @test({} %state, i32 %rcr, float %arg, float %arg2) !lgc.cps !0 {
   %t0 = fadd float %arg, 1.0
   %cr = call i32 @lgc.cps.as.continuation.reference(ptr @callee)
-  %t1 = call float (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %t0)
+  %t1 = call float (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %t0), !continuation.returnedRegistercount !{i32 0}
   %t2 = fmul float %t1, %arg
   %returnvalue = fadd float %t2, %arg2
-  call void (...) @lgc.cps.jump(i32 %rcr, i32 2, {} poison, i32 poison, float %returnvalue)
+  call void (...) @lgc.cps.jump(i32 %rcr, i32 2, {} poison, i32 poison, i32 poison, float %returnvalue)
   unreachable
 }
 
@@ -19,7 +21,7 @@ declare i32 @lgc.cps.as.continuation.reference(...) memory(none)
 declare float @lgc.cps.await__f32(...)
 declare void @lgc.cps.jump(...)
 ; CHECK-LABEL: define void @test(
-; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], float [[ARG2:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] {
+; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]], float [[ARG2:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] !continuation.stacksize [[META2:![0-9]+]] !continuation.state [[META2]] {
 ; CHECK-NEXT:  AllocaSpillBB:
 ; CHECK-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 12)
 ; CHECK-NEXT:    [[ARG2_SPILL_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 2
@@ -32,12 +34,12 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee)
 ; CHECK-NEXT:    [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @test.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 [[TMP1]], float [[T0]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 poison, i32 [[TMP1]], float [[T0]]), !continuation.returnedRegistercount [[META3:![0-9]+]]
 ; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define dso_local void @test.resume.0(
-; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] {
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] !continuation.registercount [[META3]] {
 ; CHECK-NEXT:  entryresume.0:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 12)
 ; CHECK-NEXT:    [[ARG2_RELOAD_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 2
@@ -49,6 +51,6 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[T2:%.*]] = fmul float [[TMP3]], [[ARG_RELOAD]]
 ; CHECK-NEXT:    [[RETURNVALUE:%.*]] = fadd float [[T2]], [[ARG2_RELOAD]]
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 12)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, float [[RETURNVALUE]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, i32 poison, float [[RETURNVALUE]])
 ; CHECK-NEXT:    unreachable
 ;
diff --git a/llvmraytracing/test/lgccps/simple-await.ll b/llvmraytracing/test/lgccps/simple-await.ll
index 613ee1a72b..1bd79540a3 100644
--- a/llvmraytracing/test/lgccps/simple-await.ll
+++ b/llvmraytracing/test/lgccps/simple-await.ll
@@ -1,14 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
 ; RUN: opt --verify-each -S  -o - -passes='lower-await,coro-early,lgc-coro-split,coro-cleanup,cleanup-continuations' %s | FileCheck --check-prefixes=CHECK %s
 ; RUN: opt --verify-each -S  -o - -passes='lower-await' %s | FileCheck --check-prefixes=LOWER-AWAIT %s
+
+!lgc.cps.module = !{}
+
 declare !lgc.cps !0 void @callee({}, i32, float)
 
 define void @test({} %state, i32 %rcr, float %arg) !lgc.cps !0 {
   %t0 = fadd float %arg, 1.0
   %cr = call i32 @lgc.cps.as.continuation.reference(ptr @callee)
-  %t1 = call float (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %t0)
+  %t1 = call float (...) @lgc.cps.await__f32(i32 %cr, i32 2, float %t0), !continuation.returnedRegistercount !{i32 0}
   %returnvalue = fmul float %t1, %arg
-  call void (...) @lgc.cps.jump(i32 %rcr, i32 2, {} poison, i32 poison, float %returnvalue)
+  call void (...) @lgc.cps.jump(i32 %rcr, i32 2, {} poison, i32 poison, i32 poison, float %returnvalue)
   unreachable
 }
 
@@ -18,7 +21,7 @@ declare i32 @lgc.cps.as.continuation.reference(...) memory(none)
 declare float @lgc.cps.await__f32(...)
 declare void @lgc.cps.jump(...)
 ; CHECK-LABEL: define void @test(
-; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] {
+; CHECK-SAME: {} [[STATE:%.*]], i32 [[RCR:%.*]], float [[ARG:%.*]]) !lgc.cps [[META0:![0-9]+]] !continuation [[META1:![0-9]+]] !continuation.stacksize [[META2:![0-9]+]] !continuation.state [[META2]] {
 ; CHECK-NEXT:  AllocaSpillBB:
 ; CHECK-NEXT:    [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8)
 ; CHECK-NEXT:    [[ARG_SPILL_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1
@@ -29,12 +32,12 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee)
 ; CHECK-NEXT:    [[TMP0:%.*]] = inttoptr i32 [[CR]] to ptr
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @test.resume.0)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 [[TMP1]], float [[T0]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[CR]], i32 2, {} poison, i32 poison, i32 [[TMP1]], float [[T0]]), !continuation.returnedRegistercount [[META3:![0-9]+]]
 ; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define dso_local void @test.resume.0(
-; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] {
+; CHECK-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], float [[TMP3:%.*]]) !lgc.cps [[META0]] !continuation [[META1]] !continuation.registercount [[META3]] {
 ; CHECK-NEXT:  entryresume.0:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8)
 ; CHECK-NEXT:    [[ARG_RELOAD_ADDR:%.*]] = getelementptr inbounds [[TEST_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 1
@@ -43,7 +46,7 @@ declare void @lgc.cps.jump(...)
 ; CHECK-NEXT:    [[RCR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[RCR_RELOAD_ADDR]], align 4
 ; CHECK-NEXT:    [[RETURNVALUE:%.*]] = fmul float [[TMP3]], [[ARG_RELOAD]]
 ; CHECK-NEXT:    call void @lgc.cps.free(i32 8)
-; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, float [[RETURNVALUE]])
+; CHECK-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR_RELOAD]], i32 2, {} poison, i32 poison, i32 poison, float [[RETURNVALUE]])
 ; CHECK-NEXT:    unreachable
 ;
 ;
@@ -54,10 +57,10 @@ declare void @lgc.cps.jump(...)
 ; LOWER-AWAIT-NEXT:    [[T0:%.*]] = fadd float [[ARG]], 1.000000e+00
 ; LOWER-AWAIT-NEXT:    [[CR:%.*]] = call i32 @lgc.cps.as.continuation.reference(ptr @callee)
 ; LOWER-AWAIT-NEXT:    [[TMP4:%.*]] = inttoptr i32 [[CR]] to ptr
-; LOWER-AWAIT-NEXT:    [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CR]], i32 2, float [[T0]])
+; LOWER-AWAIT-NEXT:    [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CR]], i32 2, float [[T0]]), !continuation.returnedRegistercount [[META2:![0-9]+]]
 ; LOWER-AWAIT-NEXT:    [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]])
 ; LOWER-AWAIT-NEXT:    [[TMP7:%.*]] = call float @lgc.ilcps.getReturnValue__f32()
 ; LOWER-AWAIT-NEXT:    [[RETURNVALUE:%.*]] = fmul float [[TMP7]], [[ARG]]
-; LOWER-AWAIT-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR]], i32 2, {} poison, i32 poison, float [[RETURNVALUE]])
+; LOWER-AWAIT-NEXT:    call void (...) @lgc.cps.jump(i32 [[RCR]], i32 2, {} poison, i32 poison, i32 poison, float [[RETURNVALUE]])
 ; LOWER-AWAIT-NEXT:    unreachable
 ;
diff --git a/tool/dumper/vkgcPipelineDumper.cpp b/tool/dumper/vkgcPipelineDumper.cpp
index 19d92ea98e..73a9460511 100644
--- a/tool/dumper/vkgcPipelineDumper.cpp
+++ b/tool/dumper/vkgcPipelineDumper.cpp
@@ -955,12 +955,13 @@ void PipelineDumper::dumpPipelineOptions(const PipelineOptions *options, std::os
            << "\n";
   dumpFile << glStatePrefix << "enableFragColor = " << options->getGlState().enableFragColor << "\n";
   dumpFile << glStatePrefix << "disableBaseVertex = " << options->getGlState().disableBaseVertex << "\n";
-  dumpFile << glStatePrefix << "enablePrimGeneratedQuery = " << options->enablePrimGeneratedQuery << "\n";
-  dumpFile << glStatePrefix << "disablePerCompFetch = " << options->disablePerCompFetch << "\n";
   dumpFile << glStatePrefix << "enablePolygonStipple = " << options->getGlState().enablePolygonStipple << "\n";
   dumpFile << glStatePrefix << "enableLineSmooth = " << options->getGlState().enableLineSmooth << "\n";
   dumpFile << glStatePrefix << "emulateWideLineStipple = " << options->getGlState().emulateWideLineStipple << "\n";
   dumpFile << glStatePrefix << "enablePointSmooth = " << options->getGlState().enablePointSmooth << "\n";
+  dumpFile << "options.enablePrimGeneratedQuery = " << options->enablePrimGeneratedQuery << "\n";
+  dumpFile << "options.disablePerCompFetch = " << options->disablePerCompFetch << "\n";
+  dumpFile << "options.optimizePointSizeWrite = " << options->optimizePointSizeWrite << "\n";
 
   // Output compile time constant info
   if (options->compileConstInfo) {
@@ -1943,6 +1944,7 @@ void PipelineDumper::updateHashForPipelineOptions(const PipelineOptions *options
   hasher->Update(options->getGlState().emulateWideLineStipple);
   hasher->Update(options->getGlState().enablePointSmooth);
   // disablePerCompFetch has been handled in updateHashForNonFragmentState
+  hasher->Update(options->optimizePointSizeWrite);
 }
 
 // =====================================================================================================================
diff --git a/tool/vfx/vfxVkSection.h b/tool/vfx/vfxVkSection.h
index f36c25ac15..147670f5be 100644
--- a/tool/vfx/vfxVkSection.h
+++ b/tool/vfx/vfxVkSection.h
@@ -541,6 +541,7 @@ class SectionPipelineOption : public Section {
 #endif
       INIT_STATE_MEMBER_NAME_TO_ADDR(SectionPipelineOption, enablePrimGeneratedQuery, MemberTypeBool, false);
       INIT_STATE_MEMBER_NAME_TO_ADDR(SectionPipelineOption, disablePerCompFetch, MemberTypeBool, false);
+      INIT_STATE_MEMBER_NAME_TO_ADDR(SectionPipelineOption, optimizePointSizeWrite, MemberTypeBool, false);
       INIT_MEMBER_NAME_TO_ADDR(SectionPipelineOption, m_compileTimeConstants, MemberTypeCompileConstInfo, true);
       return addrTableInitializer;
     }();
diff --git a/util/extensions.txt b/util/extensions.txt
index 36a5c07005..8bd2f7b5fe 100644
--- a/util/extensions.txt
+++ b/util/extensions.txt
@@ -44,6 +44,7 @@ SPV_KHR_workgroup_memory_explicit_layout
 SPV_KHR_cooperative_matrix
 SPV_NV_shader_atomic_float
 SPV_NV_compute_shader_derivatives
+SPV_KHR_compute_shader_derivatives
 SPV_KHR_maximal_reconvergence
 SPV_KHR_expect_assume
 SPV_KHR_shader_quad_control
diff --git a/util/gpurtshim/GpurtShim.cpp b/util/gpurtshim/GpurtShim.cpp
index 12f2265c8a..834ce78554 100644
--- a/util/gpurtshim/GpurtShim.cpp
+++ b/util/gpurtshim/GpurtShim.cpp
@@ -51,9 +51,7 @@ static Pal::RayTracingIpLevel getRtIpLevel(RtIpVersion rtIpVersion) {
       {{0, 0}, Pal::RayTracingIpLevel::_None},
       {{1, 0}, Pal::RayTracingIpLevel::RtIp1_0},
       {{1, 1}, Pal::RayTracingIpLevel::RtIp1_0},
-#if PAL_BUILD_GFX11
       {{2, 0}, Pal::RayTracingIpLevel::RtIp2_0},
-#endif
   };
   // clang-format on
 
diff --git a/util/vkgcExtension.cpp b/util/vkgcExtension.cpp
index 3c6becfecf..bd32b52fe8 100644
--- a/util/vkgcExtension.cpp
+++ b/util/vkgcExtension.cpp
@@ -103,6 +103,7 @@ const ExtensionNamePair ExtensionNameTable[ExtensionCount] = {
     DeclExtensionName(KHR_EXPECT_ASSUME),
     DeclExtensionName(KHR_SHADER_QUAD_CONTROL),
     DeclExtensionName(KHR_SUBGROUP_ROTATE),
+    DeclExtensionName(KHR_COMPUTE_SHADER_DERIVATIVES),
 };
 
 // =====================================================================================================================
diff --git a/util/vkgcExtension.h b/util/vkgcExtension.h
index 505c3d6f49..80874af8c2 100644
--- a/util/vkgcExtension.h
+++ b/util/vkgcExtension.h
@@ -85,6 +85,7 @@ enum Extension : unsigned {
   KHR_EXPECT_ASSUME,
   KHR_SHADER_QUAD_CONTROL,
   KHR_SUBGROUP_ROTATE,
+  KHR_COMPUTE_SHADER_DERIVATIVES,
   ExtensionCount,
 };
 
diff --git a/version/CMakeLists.txt b/version/CMakeLists.txt
index 23bb621cd3..b00c12373c 100644
--- a/version/CMakeLists.txt
+++ b/version/CMakeLists.txt
@@ -41,19 +41,7 @@ endif()
 target_compile_definitions(llpc_version INTERFACE
   LLPC_CLIENT_INTERFACE_MAJOR_VERSION=${LLPC_CLIENT_INTERFACE_MAJOR_VERSION}
   # Hardcode the endian-ness define. Our register headers expect it anyway
-  LITTLEENDIAN_CPU
-  CHIP_HDR_NAVI14
-  CHIP_HDR_NAVI21
-  CHIP_HDR_NAVI22
-  CHIP_HDR_NAVI23
-  CHIP_HDR_NAVI24
-#if VKI_BUILD_NAVI31
-  CHIP_HDR_NAVI31
-#endif
-#if VKI_BUILD_NAVI33
-  CHIP_HDR_NAVI33
-#endif
-  CHIP_HDR_RENOIR)
+  LITTLEENDIAN_CPU)
 
 # Mark llpc_version as an installable target for which *.cmake files are created
 # that a hypothetical external user could include to import the target, either
@@ -76,57 +64,15 @@ if (NOT DISABLE_LLPC_VERSION_USES_LLVM)
 endif()
 
 ### Cached Config-related Options ######################################################################################
-#if VKI_BUILD_NAVI12
-option(LLPC_BUILD_NAVI12 "LLPC support for NAVI12?" ON)
-if (LLPC_BUILD_NAVI12)
-  target_compile_definitions(llpc_version INTERFACE LLPC_BUILD_NAVI12 CHIP_HDR_NAVI12)
-endif()
-#endif
-#if VKI_BUILD_NAVI32
-option(LLPC_BUILD_NAVI32 "LLPC support for NAVI32?" ON)
-if (LLPC_BUILD_NAVI32)
-  target_compile_definitions(llpc_version INTERFACE LLPC_BUILD_NAVI32 CHIP_HDR_NAVI32)
-endif()
-#endif
-#if VKI_BUILD_REMBRANDT
-option(LLPC_BUILD_REMBRANDT "LLPC support for REMBRANDT?" ON)
-if (LLPC_BUILD_REMBRANDT)
-  target_compile_definitions(llpc_version INTERFACE LLPC_BUILD_REMBRANDT CHIP_HDR_REMBRANDT)
-endif()
-#endif
-#if VKI_BUILD_RAPHAEL
-option(LLPC_BUILD_RAPHAEL "LLPC support for RAPHAEL?" ON)
-if (LLPC_BUILD_RAPHAEL)
-  target_compile_definitions(llpc_version INTERFACE LLPC_BUILD_RAPHAEL CHIP_HDR_RAPHAEL)
-endif()
-#endif
-#if VKI_BUILD_MENDOCINO
-option(LLPC_BUILD_MENDOCINO "LLPC support for MENDOCINO?" ON)
-if (LLPC_BUILD_MENDOCINO)
-  target_compile_definitions(llpc_version INTERFACE LLPC_BUILD_MENDOCINO CHIP_HDR_MENDOCINO)
-endif()
-#endif
-#if VKI_BUILD_PHOENIX1
-option(LLPC_BUILD_PHOENIX1 "LLPC support for PHOENIX1?" ON)
-if (LLPC_BUILD_PHOENIX1)
-  target_compile_definitions(llpc_version INTERFACE LLPC_BUILD_PHOENIX1 CHIP_HDR_PHOENIX1)
-endif()
-#endif
-#if VKI_BUILD_PHOENIX2
-option(LLPC_BUILD_PHOENIX2 "LLPC support for PHOENIX2?" OFF)
-if (LLPC_BUILD_PHOENIX2)
-  target_compile_definitions(llpc_version INTERFACE LLPC_BUILD_PHOENIX2 CHIP_HDR_PHOENIX2)
-endif()
-#endif
 #if VKI_BUILD_STRIX1
 option(LLPC_BUILD_STRIX1 "LLPC support for STRIX1?" ON)
 if (LLPC_BUILD_STRIX1)
-  target_compile_definitions(llpc_version INTERFACE LLPC_BUILD_STRIX1 CHIP_HDR_STRIX1)
+  target_compile_definitions(llpc_version INTERFACE LLPC_BUILD_STRIX1)
 endif()
 #endif
 #if VKI_BUILD_GFX115
 option(LLPC_BUILD_GFX115 "LLPC support for GFX11.5?" ON)
 if (LLPC_BUILD_GFX115)
-  target_compile_definitions(llpc_version INTERFACE LLPC_BUILD_GFX115 CHIP_HDR_GFX115)
+  target_compile_definitions(llpc_version INTERFACE LLPC_BUILD_GFX115)
 endif()
 #endif
diff --git a/version/include/llpc/GpurtIntrinsics.h b/version/include/llpc/GpurtIntrinsics.h
index 54dbb71660..911d054747 100644
--- a/version/include/llpc/GpurtIntrinsics.h
+++ b/version/include/llpc/GpurtIntrinsics.h
@@ -62,8 +62,6 @@
 #endif
 #endif
 
-#define CONTINUATIONS_LGC_STACK_LOWERING 1
-
 //=====================================================================================================================
 // Continuation intrinsics
 //
@@ -157,6 +155,11 @@ GPURT_DECL uint64_t _AmdGetResumePointAddr() DUMMY_GENERIC_FUNC(0)
 // Returns the address of the caller function making this intrinsic call, after inlining and continuation function splitting.
 GPURT_DECL uint64_t _AmdGetCurrentFuncAddr() DUMMY_GENERIC_FUNC(0)
 //
+// GetShaderRecordIndex
+// --------
+// Returns the shader record index of the current caller.
+GPURT_DECL uint32_t _AmdGetShaderRecordIndex() DUMMY_GENERIC_FUNC(0)
+//
 //=====================================================================================================================
 // GetShaderKind
 // Returns the kind of the shader this intrinsic is used in.
diff --git a/version/include/llpcVersion.h.in b/version/include/llpcVersion.h.in
index 773df94485..13d6bfaee0 100644
--- a/version/include/llpcVersion.h.in
+++ b/version/include/llpcVersion.h.in
@@ -37,6 +37,7 @@
 //  %Version History
 //  | %Version | Change Description                                                                                    |
 //  | -------- | ----------------------------------------------------------------------------------------------------- |
+//  |     75.5 | Add optimizePointSizeWrite to PipelineOptions in order to optimize the case PointSize = 1.0.          |
 //  |     75.4 | Add disableGlPositionOpt to PipelineShaderOptions.                                                    |
 //  |     75.3 | Add enableInitUndefZero to GraphicPipelineBuildInfo                                                   |
 //  |     75.2 | Add CompileConstInfo to PipelineShaderOptions.                                                        |
@@ -196,7 +197,7 @@
 #define LLPC_INTERFACE_MAJOR_VERSION 75
 
 /// LLPC minor interface version.
-#define LLPC_INTERFACE_MINOR_VERSION 4
+#define LLPC_INTERFACE_MINOR_VERSION 5
 
 /// The client's LLPC major interface version
 #ifndef LLPC_CLIENT_INTERFACE_MAJOR_VERSION