# Adding 8-bit dequantization kernel for asymmetric fine-grained block quantization in ZeRO-Inference (#3122)

	name: nv-pre-compile-ops

	on:
	pull_request:
	branches:
	'**'
	paths-ignore:
	- 'docs/**'
	- 'blogs/**'
	merge_group:
	branches: [ master ]
	schedule:
	- cron: "0 0 * * *"

	concurrency:
	group: ${{ github.workflow }}-${{ github.ref }}
	cancel-in-progress: true

	jobs:
	build-ops:
	runs-on: ubuntu-20.04
	container:
	image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116

	steps:
	- uses: actions/checkout@v3

	- name: environment
	run: \|
	which python
	python --version
	python -c "import torch; print('torch:', torch.__version__, torch)"
	#python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
	- name: Compile DeepSpeed Ops
	run: \|
	TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
	- name: DS Report
	run: \|
	ds_report

# Provide feedback