# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Repository id of the flash attention kernel to load.
_FLASH_ATTN_REPO_ID = "kernels-community/flash-attn"

# Load the kernel once at module level; hf_flash_attention reuses this handle
# on every call instead of loading it again.
hf_kernels_flash_attn = get_kernel(_FLASH_ATTN_REPO_ID)
def hf_flash_attention(query, key, value):
    """Run the HuggingFace Kernels flash attention forward pass (non-causal).

    Passes ``query``, ``key`` and ``value`` through unchanged to the ``fwd``
    entry point of the module-level ``hf_kernels_flash_attn`` kernel with
    ``is_causal=False`` and returns the first element of what ``fwd`` gives
    back.
    """
    fwd_result = hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)
    # NOTE(review): element 0 is presumably the attention output and the
    # remaining elements auxiliary values -- confirm against the kernel's API.
    return fwd_result[0]
# Metadata tags describing this implementation, handed to the benchmark
# harness as ``impl_tags``.
_impl_tags = {"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}

# Run the attention benchmark with hf_flash_attention as the implementation
# under test.
run_benchmark(
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_name="hf_kernels_flash_attn",
    impl_tags=_impl_tags,
    impl_func=hf_flash_attention,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 3.72% 162.003us 41.36% 1.801ms 1.801ms 0.000us 0.00% 3.718ms 3.718ms 1
_flash_attn_9e27194::fwd 1.69% 73.411us 37.64% 1.639ms 546.409us 2.775ms 100.00% 3.718ms 1.239ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.777ms 100.05% 2.777ms 2.777ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.775ms 100.00% 2.775ms 925.087us 3
Activity Buffer Request 32.85% 1.431ms 32.85% 1.431ms 1.431ms 943.102us 33.98% 943.102us 943.102us 1
cudaDeviceGetAttribute 0.11% 4.701us 0.11% 4.701us 0.313us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.40% 17.630us 1.19% 51.921us 17.307us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.79% 34.291us 0.79% 34.291us 11.430us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.58% 25.250us 0.58% 25.250us 2.806us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.29% 12.441us 0.29% 12.441us 4.147us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.94% 40.982us 0.94% 40.982us 13.661us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 58.64% 2.554ms 58.64% 2.554ms 2.554ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.355ms
Self CUDA time total: 2.775ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.15% 96.184us 36.83% 1.645ms 1.645ms 0.000us 0.00% 3.965ms 3.965ms 1
_flash_attn_9e27194::fwd 1.07% 47.845us 34.67% 1.549ms 516.264us 2.974ms 100.00% 3.965ms 1.322ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.975ms 100.05% 2.975ms 2.975ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.974ms 100.00% 2.974ms 991.313us 3
Activity Buffer Request 31.80% 1.421ms 31.80% 1.421ms 1.421ms 990.779us 33.32% 990.779us 990.779us 1
cudaDeviceGetAttribute 0.08% 3.723us 0.08% 3.723us 0.248us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.15% 6.890us 0.53% 23.451us 7.817us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.37% 16.561us 0.37% 16.561us 5.520us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.50% 22.171us 0.50% 22.171us 2.463us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.09% 3.911us 0.09% 3.911us 1.304us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.61% 27.040us 0.61% 27.040us 9.013us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 63.17% 2.822ms 63.17% 2.822ms 2.822ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.467ms
Self CUDA time total: 2.974ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.41% 109.001us 36.55% 1.652ms 1.652ms 0.000us 0.00% 4.036ms 4.036ms 1
_flash_attn_9e27194::fwd 1.11% 50.180us 34.14% 1.543ms 514.365us 3.018ms 100.00% 4.036ms 1.345ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.019ms 100.05% 3.019ms 3.019ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.018ms 100.00% 3.018ms 1.006ms 3
Activity Buffer Request 31.22% 1.411ms 31.22% 1.411ms 1.411ms 1.018ms 33.73% 1.018ms 1.018ms 1
cudaDeviceGetAttribute 0.08% 3.790us 0.08% 3.790us 0.253us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.16% 7.151us 0.52% 23.401us 7.800us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.36% 16.250us 0.36% 16.250us 5.417us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.48% 21.660us 0.48% 21.660us 2.407us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.10% 4.380us 0.10% 4.380us 1.460us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.64% 28.812us 0.64% 28.812us 9.604us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 63.45% 2.868ms 63.45% 2.868ms 2.868ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.520ms
Self CUDA time total: 3.018ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.47% 118.264us 38.70% 1.854ms 1.854ms 0.000us 0.00% 4.130ms 4.130ms 1
_flash_attn_9e27194::fwd 1.01% 48.470us 36.23% 1.735ms 578.465us 3.094ms 100.00% 4.130ms 1.377ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.096ms 100.05% 3.096ms 3.096ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.094ms 100.00% 3.094ms 1.031ms 3
Activity Buffer Request 29.33% 1.405ms 29.33% 1.405ms 1.405ms 1.036ms 33.49% 1.036ms 1.036ms 1
cudaDeviceGetAttribute 0.08% 3.720us 0.08% 3.720us 0.248us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.16% 7.520us 0.53% 25.440us 8.480us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.37% 17.920us 0.37% 17.920us 5.973us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.43% 20.670us 0.43% 20.670us 2.297us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.08% 4.010us 0.08% 4.010us 1.337us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 4.76% 227.935us 4.76% 227.935us 75.978us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 61.30% 2.937ms 61.30% 2.937ms 2.937ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.790ms
Self CUDA time total: 3.094ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.07% 110.462us 34.39% 1.835ms 1.835ms 0.000us 0.00% 4.876ms 4.876ms 1
_flash_attn_9e27194::fwd 0.91% 48.552us 32.32% 1.724ms 574.769us 3.652ms 100.00% 4.876ms 1.625ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.654ms 100.05% 3.654ms 3.654ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.652ms 100.00% 3.652ms 1.217ms 3
Activity Buffer Request 27.00% 1.440ms 27.00% 1.440ms 1.440ms 1.224ms 33.53% 1.224ms 1.224ms 1
cudaDeviceGetAttribute 0.07% 3.831us 0.07% 3.831us 0.255us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.15% 7.880us 0.47% 24.970us 8.323us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.32% 17.090us 0.32% 17.090us 5.697us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.44% 23.410us 0.44% 23.410us 2.601us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.08% 4.110us 0.08% 4.110us 1.370us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.36% 179.284us 3.36% 179.284us 59.761us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 65.61% 3.500ms 65.61% 3.500ms 3.500ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.335ms
Self CUDA time total: 3.652ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.06% 108.982us 33.74% 1.784ms 1.784ms 0.000us 0.00% 4.883ms 4.883ms 1
_flash_attn_9e27194::fwd 0.92% 48.842us 31.68% 1.675ms 558.369us 3.652ms 100.00% 4.883ms 1.628ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.654ms 100.04% 3.654ms 3.654ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.652ms 100.00% 3.652ms 1.217ms 3
Activity Buffer Request 26.57% 1.405ms 26.57% 1.405ms 1.405ms 1.231ms 33.70% 1.231ms 1.231ms 1
cudaDeviceGetAttribute 0.07% 3.720us 0.07% 3.720us 0.248us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.14% 7.460us 0.45% 23.940us 7.980us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.31% 16.480us 0.31% 16.480us 5.493us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.43% 22.601us 0.43% 22.601us 2.511us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.07% 3.610us 0.07% 3.610us 1.203us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.17% 167.603us 3.17% 167.603us 55.868us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 66.26% 3.504ms 66.26% 3.504ms 3.504ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.288ms
Self CUDA time total: 3.652ms
impl wl p50(ms) ok
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.08 True
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.22 True
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
Fetching 20 files: 10%|█ | 2/20 [00:01<00:14, 1.21it/s]
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 12.08it/s]