Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into llama
Browse files Browse the repository at this point in the history
  • Loading branch information
TylunasLi committed Mar 23, 2024
2 parents 71a7856 + 0f22a2c commit ef0b7a6
Show file tree
Hide file tree
Showing 38 changed files with 2,532 additions and 302 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
./models
./build/
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,6 @@ token
/example/Android/LLMAssistant/.externalNativeBuild
/example/Android/LLMAssistant/.cxx
/example/Android/LLMAssistant/local.properties
/test/cmmlu/result/
/test/cmmlu/results/
/models/
/localtest/
9 changes: 6 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ set(CMAKE_BUILD_TYPE "Release")
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread --std=c++17 -O2")
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNOMINMAX -O2 /std:c++17 /arch:AVX /source-charset:utf-8")
set(CMAKE_CXX_FLAGS_DEBUG "/MTd /Zi /Ob0 /Od /RTC1")
set(CMAKE_CXX_FLAGS_RELEASE "/MT /O2 /Ob1 /Gy /DNDEBUG")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNOMINMAX /std:c++17 /arch:AVX2 /source-charset:utf-8")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread --std=c++17 -O2 -march=native")
endif()
Expand All @@ -40,6 +42,7 @@ set(FASTLLM_CXX_SOURCES src/fastllm.cpp src/device.cpp src/model.cpp src/executo
include_directories(include)
include_directories(include/utils)
include_directories(include/models)
include_directories(include/devices/cpu)

if (USE_MMAP)
add_compile_definitions(USE_MMAP)
Expand Down Expand Up @@ -121,8 +124,8 @@ if (${CMAKE_HOST_WIN32})
COMMAND ${CMAKE_COMMAND} -E make_directory tools/fastllm_pytools
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/tools/fastllm_pytools ${CMAKE_BINARY_DIR}/tools/fastllm_pytools/.
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/tools/scripts ${CMAKE_BINARY_DIR}/tools/.
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/fastllm_tools.dll ${CMAKE_BINARY_DIR}/tools/fastllm_pytools/.
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_BINARY_DIR}/fastllm_tools.dll
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/$(Configuration)/fastllm_tools.dll ${CMAKE_BINARY_DIR}/tools/fastllm_pytools/.
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_BINARY_DIR}/$(Configuration)/fastllm_tools.dll
)
else()
add_custom_command(
Expand Down
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# syntax=docker/dockerfile:1-labs
FROM nvidia/cuda:11.7.1-devel-ubuntu22.04
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04

# Update Apt repositories
RUN apt-get update
Expand All @@ -12,7 +12,7 @@ RUN pip install setuptools streamlit-chat
ENV WORKDIR /fastllm

# Install cmake
RUN wget -c https://cmake.org/files/LatestRelease/cmake-3.27.0-linux-x86_64.sh && bash ./cmake-3.27.0-linux-x86_64.sh --skip-license --prefix=/usr/
RUN wget -c https://cmake.org/files/LatestRelease/cmake-3.28.3-linux-x86_64.sh && bash ./cmake-3.28.3-linux-x86_64.sh --skip-license --prefix=/usr/

WORKDIR $WORKDIR
ADD . $WORKDIR/
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ python setup.py install
我们假设已经获取了名为`model.flm`的模型(参照 [模型获取](#模型获取),初次使用可以先下载转换好的模型)

编译完成之后在build目录下可以使用下列demo:

``` sh
# 这时在fastllm/build目录下

Expand All @@ -177,6 +178,8 @@ streamlit run tools/web_demo.py model.flm

```

Windows下的编译推荐使用CMake GUI + Visual Studio,在图形化界面中完成。

如编译中存在问题,尤其是Windows下的编译,可参考[FAQ](docs/faq.md)

### 简易python调用
Expand Down
8 changes: 7 additions & 1 deletion docs/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,20 @@ cmake .. -DUSE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native
> include\fastllm.h(234): error : identifier "DataDevice" is undefined
> ....
**解决办法:** 使用cmake构建通常不存在这一问题。参考 [example\README.md](/example/README.md)。签出代码后,**修改 include/fastllm.h**,Visual Studio中点击”文件“ -> "高级保存选项",在编码中选择”Unicode (UTF-8 **带签名**) -代码页 65001“,或在其他文本编辑器中转为”UTF-8 BOM“编码。(由于linux下gcc不识别BOM头,MSVC依赖BOM判断文件编码,该修改只能手动处理。)
**解决办法:** 参考 [example\README.md](/example/README.md)。签出代码后,**修改 include/fastllm.h**,Visual Studio中点击”文件“ -> "高级保存选项",在编码中选择”Unicode (UTF-8 **带签名**) -代码页 65001“,或在其他文本编辑器中转为”UTF-8 BOM“编码。(由于linux下gcc不识别BOM头,MSVC依赖BOM判断文件编码,该修改只能手动处理。)

### main.exe 无法识别中文输入

**原因:** Windows下cmd不支持UTF-8编码。

**解决办法:** 编译[Win32Demo](/example/README.md#win32demo-windows平台) 或使用 [WebUI](/example/README.md#web-ui)

### Windows(MSVC)编译下,int4出现乱码

**原因:** MSVC编译器优化选项 "`/Ob2`"、"`/Ob3`"与现有代码冲突。

**解决办法:** 编译时,在“属性”中找到"C/C++" -> "优化" -> "内联函数扩展" 中选择“只适用于 __inline (/Ob1)”。

### 导入提示 FileNotFoundError

**现象:**
Expand Down
21 changes: 16 additions & 5 deletions example/Win32Demo/fastllm-gpu.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,9 @@
<PreprocessorDefinitions>NOMINMAX;USE_CUDA;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<AdditionalIncludeDirectories>$(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalOptions>/arch:AVX /source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions>/source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
</ClCompile>
<Link>
<AdditionalDependencies>cudart.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
Expand All @@ -120,7 +122,9 @@
<PreprocessorDefinitions>NOMINMAX;USE_CUDA;WIN64;__AVX__;__AVX2__;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<AdditionalOptions>/arch:AVX /source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions>/source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
</ClCompile>
<Link>
<AdditionalDependencies>cudart.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
Expand All @@ -143,7 +147,11 @@
<PreprocessorDefinitions>NOMINMAX;USE_CUDA;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
<AdditionalIncludeDirectories>$(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalOptions>/arch:AVX /source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions>/source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
<DebugInformationFormat />
</ClCompile>
<Link>
<AdditionalDependencies>cudart.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
Expand All @@ -167,7 +175,11 @@
<PreprocessorDefinitions>NOMINMAX;USE_CUDA;__AVX__;__AVX2__;WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
<AdditionalOptions>/arch:AVX /source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions>/source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
<DebugInformationFormat />
</ClCompile>
<Link>
<AdditionalDependencies>cudart.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
Expand Down Expand Up @@ -218,7 +230,6 @@
<ClCompile Include="..\..\src\models\minicpm.cpp" />
<ClCompile Include="..\..\src\models\moss.cpp" />
<ClCompile Include="..\..\src\models\qwen.cpp" />
<ClCompile Include="..\..\src\pybinding.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\include\devices\cuda\fastllm-cuda.cuh">
Expand Down
3 changes: 0 additions & 3 deletions example/Win32Demo/fastllm-gpu.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,6 @@
<ClCompile Include="..\..\src\model.cpp">
<Filter>源文件</Filter>
</ClCompile>
<ClCompile Include="..\..\src\pybinding.cpp">
<Filter>源文件</Filter>
</ClCompile>
<ClCompile Include="..\..\src\models\basellm.cpp">
<Filter>源文件\models</Filter>
</ClCompile>
Expand Down
44 changes: 22 additions & 22 deletions example/Win32Demo/fastllm.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" />
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
Expand Down Expand Up @@ -94,37 +93,35 @@
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>NOMINMAX;_LIB;WIN32;_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>NOMINMAX;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<AdditionalIncludeDirectories>$(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalOptions>/arch:AVX /source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions>/source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
</ClCompile>
<Link>
<AdditionalDependencies>cudart.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
<SubSystem>Windows</SubSystem>
</Link>
<CudaCompile>
<CodeGeneration>compute_61,sm_61;%(CodeGeneration)</CodeGeneration>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>NOMINMAX;_LIB;__AVX__;__AVX2__;WIN64;_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>NOMINMAX;WIN64;__AVX__;__AVX2__;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<AdditionalOptions>/arch:AVX /source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions>/source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
</ClCompile>
<Link>
<AdditionalDependencies>cudart.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
<SubSystem>Windows</SubSystem>
</Link>
<CudaCompile>
<CodeGeneration>compute_61,sm_61;%(CodeGeneration)</CodeGeneration>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
Expand All @@ -134,20 +131,21 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>NOMINMAX;_LIB;WIN32;NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>NOMINMAX;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
<AdditionalIncludeDirectories>$(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalOptions>/arch:AVX /source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions>/source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
<DebugInformationFormat />
</ClCompile>
<Link>
<AdditionalDependencies>cudart.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
<SubSystem>Windows</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
<CudaCompile>
<CodeGeneration>compute_61,sm_61;%(CodeGeneration)</CodeGeneration>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
Expand All @@ -157,10 +155,14 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>NOMINMAX;_LIB;__AVX__;__AVX2__;WIN64;NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>NOMINMAX;__AVX__;__AVX2__;WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
<AdditionalOptions>/arch:AVX /source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions>/source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
<DebugInformationFormat />
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
Expand Down Expand Up @@ -202,8 +204,6 @@
<ClCompile Include="..\..\src\models\minicpm.cpp" />
<ClCompile Include="..\..\src\models\moss.cpp" />
<ClCompile Include="..\..\src\models\qwen.cpp" />
<ClCompile Include="..\..\src\pybinding.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
3 changes: 0 additions & 3 deletions example/Win32Demo/fastllm.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,6 @@
<ClCompile Include="..\..\src\model.cpp">
<Filter>源文件</Filter>
</ClCompile>
<ClCompile Include="..\..\src\pybinding.cpp">
<Filter>源文件</Filter>
</ClCompile>
<ClCompile Include="..\..\src\models\basellm.cpp">
<Filter>源文件\models</Filter>
</ClCompile>
Expand Down
1 change: 1 addition & 0 deletions include/devices/cuda/fastllm-cuda.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ bool FastllmCudaPermute(fastllm::Data &input, const std::vector<int> &axis);
bool FastllmCudaMatMulFloatInt8(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloatInt4(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloatInt4NoZero(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloatInt4Group(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloat32(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloat16(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaBatchMatMul(const fastllm::Data &input0, const fastllm::Data &input1, fastllm::Data &output,
Expand Down
Loading

0 comments on commit ef0b7a6

Please sign in to comment.