# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Repository id handed to `get_kernel` for the FlashAttention-3 build.
_FLASH_ATTN3_REPO = "kernels-community/flash-attn3"

# Load the kernel once at module level so every benchmark invocation of
# `hf_flash_attention3` reuses the same loaded module.
hf_kernels_flash_attn3 = get_kernel(_FLASH_ATTN3_REPO)
def hf_flash_attention3(query, key, value):
    """Run non-causal attention through the loaded FlashAttention-3 kernel.

    Forwards ``query``, ``key`` and ``value`` unchanged to
    ``hf_kernels_flash_attn3.flash_attn_func`` with ``causal=False`` and
    returns element 0 of its result.

    NOTE(review): element 0 is presumably the attention output tensor and
    any further elements are auxiliary values; confirm against the kernel's
    ``flash_attn_func`` interface.
    """
    kernel_result = hf_kernels_flash_attn3.flash_attn_func(
        query,
        key,
        value,
        causal=False,
    )
    # Only the first element is handed back to the benchmark harness.
    return kernel_result[0]
# Metadata tags attached to this implementation in the benchmark results.
_impl_tags = {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}

# Run the attention benchmark against the wrapper defined in this file.
run_benchmark(
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_name="hf_kernels_flash_attn3",
    impl_tags=_impl_tags,
    impl_func=hf_flash_attention3,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 3.90% 167.264us 45.75% 1.964ms 1.964ms 0.000us 0.00% 3.551ms 3.551ms 1
FlashAttnFunc 3.34% 143.492us 41.85% 1.797ms 598.836us 0.000us 0.00% 3.551ms 1.184ms 3
_flash_attn3_48fe103_dirty::fwd 1.86% 80.044us 38.51% 1.653ms 551.005us 2.654ms 100.00% 3.551ms 1.184ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.655ms 100.05% 2.655ms 2.655ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.654ms 100.00% 2.654ms 884.532us 3
Activity Buffer Request 34.19% 1.468ms 34.19% 1.468ms 1.468ms 897.822us 33.83% 897.822us 897.822us 1
aten::empty 1.09% 46.590us 1.09% 46.590us 7.765us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.30% 12.680us 0.30% 12.680us 4.227us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.07% 45.911us 1.07% 45.911us 15.304us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 54.25% 2.329ms 54.25% 2.329ms 2.329ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.293ms
Self CUDA time total: 2.654ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.52% 108.973us 40.29% 1.745ms 1.745ms 0.000us 0.00% 3.761ms 3.761ms 1
FlashAttnFunc 2.11% 91.250us 37.77% 1.636ms 545.408us 0.000us 0.00% 3.761ms 1.254ms 3
_flash_attn3_48fe103_dirty::fwd 1.23% 53.414us 35.67% 1.545ms 514.991us 2.811ms 100.00% 3.761ms 1.254ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.813ms 100.05% 2.813ms 2.813ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.811ms 100.00% 2.811ms 937.084us 3
Activity Buffer Request 32.99% 1.429ms 32.99% 1.429ms 1.429ms 949.852us 33.79% 949.852us 949.852us 1
aten::empty 0.64% 27.630us 0.64% 27.630us 4.605us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 4.980us 0.11% 4.980us 1.660us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.69% 30.020us 0.69% 30.020us 10.007us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 59.71% 2.587ms 59.71% 2.587ms 2.587ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.332ms
Self CUDA time total: 2.811ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.50% 112.343us 38.92% 1.748ms 1.748ms 0.000us 0.00% 3.960ms 3.960ms 1
FlashAttnFunc 2.05% 91.871us 36.42% 1.636ms 545.325us 0.000us 0.00% 3.960ms 1.320ms 3
_flash_attn3_48fe103_dirty::fwd 1.14% 51.221us 34.37% 1.544ms 514.701us 2.972ms 100.00% 3.960ms 1.320ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.973ms 100.05% 2.973ms 2.973ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.972ms 100.00% 2.972ms 990.630us 3
Activity Buffer Request 31.81% 1.429ms 31.81% 1.429ms 1.429ms 987.835us 33.24% 987.835us 987.835us 1
aten::empty 0.63% 28.400us 0.63% 28.400us 4.733us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 5.211us 0.12% 5.211us 1.737us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.67% 30.301us 0.67% 30.301us 10.100us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 61.08% 2.744ms 61.08% 2.744ms 2.744ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.492ms
Self CUDA time total: 2.972ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.16% 102.333us 40.97% 1.945ms 1.945ms 0.000us 0.00% 4.045ms 4.045ms 1
FlashAttnFunc 1.95% 92.400us 38.81% 1.843ms 614.206us 0.000us 0.00% 4.045ms 1.348ms 3
_flash_attn3_48fe103_dirty::fwd 1.07% 50.872us 36.87% 1.750ms 583.406us 3.024ms 100.00% 4.045ms 1.348ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.026ms 100.05% 3.026ms 3.026ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.024ms 100.00% 3.024ms 1.008ms 3
Activity Buffer Request 29.88% 1.419ms 29.88% 1.419ms 1.419ms 1.021ms 33.76% 1.021ms 1.021ms 1
aten::empty 0.61% 28.961us 0.61% 28.961us 4.827us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.320us 0.11% 5.320us 1.773us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 5.19% 246.415us 5.19% 246.415us 82.138us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 59.03% 2.803ms 59.03% 2.803ms 2.803ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.747ms
Self CUDA time total: 3.024ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.48% 128.541us 37.35% 1.936ms 1.936ms 0.000us 0.00% 4.636ms 4.636ms 1
FlashAttnFunc 1.81% 93.984us 34.87% 1.807ms 602.493us 0.000us 0.00% 4.636ms 1.545ms 3
_flash_attn3_48fe103_dirty::fwd 0.96% 49.852us 33.05% 1.713ms 571.165us 3.473ms 100.00% 4.636ms 1.545ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.475ms 100.05% 3.475ms 3.475ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.473ms 100.00% 3.473ms 1.158ms 3
Activity Buffer Request 27.80% 1.441ms 27.80% 1.441ms 1.441ms 1.163ms 33.49% 1.163ms 1.163ms 1
aten::empty 0.57% 29.640us 0.57% 29.640us 4.940us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.160us 0.10% 5.160us 1.720us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.62% 187.873us 3.62% 187.873us 62.624us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 62.65% 3.248ms 62.65% 3.248ms 3.248ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.184ms
Self CUDA time total: 3.473ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.41% 121.192us 36.39% 1.829ms 1.829ms 0.000us 0.00% 4.566ms 4.566ms 1
FlashAttnFunc 1.84% 92.271us 33.97% 1.707ms 569.139us 0.000us 0.00% 4.566ms 1.522ms 3
_flash_attn3_48fe103_dirty::fwd 1.00% 50.242us 32.14% 1.615ms 538.382us 3.416ms 100.00% 4.566ms 1.522ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.417ms 100.04% 3.417ms 3.417ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.416ms 100.00% 3.416ms 1.139ms 3
Activity Buffer Request 27.08% 1.361ms 27.08% 1.361ms 1.361ms 1.150ms 33.68% 1.150ms 1.150ms 1
aten::empty 0.60% 30.030us 0.60% 30.030us 5.005us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.061us 0.10% 5.061us 1.687us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.36% 168.913us 3.36% 168.913us 56.304us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 63.61% 3.197ms 63.61% 3.197ms 3.197ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.026ms
Self CUDA time total: 3.416ms
impl wl p50(ms) ok
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 9.18it/s]
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.23it/s]
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.82it/s]