{"ts": "2025-10-29T04:14:30Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2110259999644768, "p50": 1.2139859999820146, "p90": 1.214847000028385, "mean": 1.2134921999972903, "iqr": 0.002731000051880983, "raw_times": [1.2139859999820146, 1.212115999976504, 1.215486000035071, 1.214847000028385, 1.2110259999644768], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2041449999742326, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null} {"ts": "2025-10-29T04:14:30Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2489469999650282, "p50": 1.2594169999715632, "p90": 1.2745669999958409, "mean": 1.2628269999936492, "iqr": 0.020920999986628885, "raw_times": [1.2489469999650282, 1.2594169999715632, 1.253646000009212, 1.2745669999958409, 1.2775580000266018], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2641869999470146, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null} {"ts": "2025-10-29T04:14:30Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2830869999902461, "p50": 1.288437999960479, "p90": 1.2899880000531994, "mean": 1.287595600001623, "iqr": 0.0053310000680539815, "raw_times": [1.2846569999851454, 1.2899880000531994, 1.288437999960479, 1.2918080000190457, 1.2830869999902461], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2736869999798728, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null} {"ts": "2025-10-29T04:14:31Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.3104980000093747, "p50": 1.3190589999680924, "p90": 1.3191280000341976, "mean": 1.3194864000070083, "iqr": 0.002650000055837154, "raw_times": [1.3104980000093747, 1.3322690000450166, 1.3190589999680924, 1.3164779999783605, 1.3191280000341976], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.3112579999869922, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null} {"ts": "2025-10-29T04:14:31Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.470441000037681, "p50": 1.4753519999999298, "p90": 1.4777020000451557, "mean": 1.4751576000094246, "iqr": 0.00719000007620707, "raw_times": [1.4777020000451557, 1.4753519999999298, 1.481780999995408, 1.470441000037681, 1.4705119999689487], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.4788519999910932, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null} {"ts": "2025-10-29T04:14:31Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.4914220000150635, "p50": 1.5084219999721427, "p90": 1.5101930000014363, "mean": 1.5035721999993257, "iqr": 0.01671100000066872, "raw_times": [1.4914220000150635, 1.4934820000007676, 1.5084219999721427, 1.5101930000014363, 1.5143420000072183], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.497162000021035, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.0003566741943359375, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}