{"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17578399996409644, "p50": 0.17709399998011577, "p90": 0.17922400002134964, "mean": 0.17895179998959065, "iqr": 0.002560000041285093, "raw_times": [0.17666399998006455, 0.17709399998011577, 0.17578399996409644, 0.18599300000232688, 0.17922400002134964], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18588400001817718, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21852399999033878, "p50": 0.22123499996951068, "p90": 0.22281499997234278, "mean": 0.22667299998602175, "iqr": 0.0019999999949504854, "raw_times": [0.2208149999773923, 0.22123499996951068, 0.21852399999033878, 0.24997600002052422, 0.22281499997234278], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22185399996033084, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21726399995714019, "p50": 0.22378500000286294, "p90": 0.22635499999523745, "mean": 0.22464679999529835, "iqr": 0.0036000000136482413, "raw_times": [0.22635499999523745, 0.22378500000286294, 0.21726399995714019, 0.23307500003966197, 0.2227549999815892], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22023499997203544, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21678499996369283, "p50": 0.2199049999944691, "p90": 0.22050500001569162, "mean": 0.21960279999575505, "iqr": 0.0022199999989425123, "raw_times": [0.22050500001569162, 0.2199049999944691, 0.2182850000167491, 0.22253399998817258, 0.21678499996369283], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23155500002758345, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21946399999706045, "p50": 0.22019400000772293, "p90": 0.22058499996546743, "mean": 0.2201885999852493, "iqr": 0.0005499999815583578, "raw_times": [0.22019400000772293, 0.22003499998390907, 0.22058499996546743, 0.21946399999706045, 0.22066499997208666], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22136500001579407, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.215785000023061, "p50": 0.21925499999042586, "p90": 0.22044500002493805, "mean": 0.22373300000708696, "iqr": 0.003450000008342613, "raw_times": [0.215785000023061, 0.22044500002493805, 0.21925499999042586, 0.24618499998041443, 0.21699500001659544], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22207500001059088, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21516500004281625, "p50": 0.2195149999693058, "p90": 0.22164500001053966, "mean": 0.21926680001342902, "iqr": 0.005660999988776894, "raw_times": [0.22402500002272063, 0.2195149999693058, 0.21516500004281625, 0.21598400002176277, 0.22164500001053966], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21819500000219705, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2141339999752745, "p50": 0.218735000032666, "p90": 0.21932399999968766, "mean": 0.22093040000754627, "iqr": 0.0017599999750927964, "raw_times": [0.2141339999752745, 0.23489500000550834, 0.21756400002459486, 0.218735000032666, 0.21932399999968766], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22086500001705645, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21650500002579065, "p50": 0.21868400000357724, "p90": 0.21925499999042586, "mean": 0.22372079999968264, "iqr": 0.0009299999987888441, "raw_times": [0.21650500002579065, 0.24583499998698244, 0.21925499999042586, 0.21832499999163701, 0.21868400000357724], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22242500000402288, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2184950000128083, "p50": 0.22323500002130459, "p90": 0.22841400004836032, "mean": 0.22448680001616594, "iqr": 0.008849000039390376, "raw_times": [0.22841400004836032, 0.22323500002130459, 0.2184950000128083, 0.21956500000896995, 0.23272499998938656], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22242500000402288, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21709500003908033, "p50": 0.22114500001180204, "p90": 0.22174499997618113, "mean": 0.22064500001306442, "iqr": 0.004549999971459329, "raw_times": [0.22174499997618113, 0.22114500001180204, 0.22604500003353678, 0.21709500003908033, 0.2171950000047218], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2633260000379778, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2232749999961925, "p50": 0.22420499999498134, "p90": 0.225494999995135, "mean": 0.22680499999978565, "iqr": 0.001499999996212864, "raw_times": [0.23705500001369728, 0.22399499999892214, 0.225494999995135, 0.22420499999498134, 0.2232749999961925], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22469399999636153, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21559499998602405, "p50": 0.2172250000285203, "p90": 0.21947499999441789, "mean": 0.2188926000030733, "iqr": 0.002500999983112706, "raw_times": [0.21559499998602405, 0.22519399999509915, 0.21697400001130518, 0.21947499999441789, 0.2172250000285203], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2516950000313045, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21588499998870248, "p50": 0.2187050000088675, "p90": 0.2197649999970963, "mean": 0.22497519998978532, "iqr": 0.0018900000213761814, "raw_times": [0.21787499997572013, 0.2526459999785402, 0.21588499998870248, 0.2187050000088675, 0.2197649999970963], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22343400002000635, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21829500002468194, "p50": 0.2230250000252454, "p90": 0.2236250000464679, "mean": 0.22705700001779405, "iqr": 0.0044400000547284435, "raw_times": [0.21829500002468194, 0.25115500000083557, 0.21918499999173946, 0.2230250000252454, 0.2236250000464679], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22709500001383276, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2149849999568687, "p50": 0.21960500004070127, "p90": 0.22131500003297333, "mean": 0.22512300001835683, "iqr": 0.0024400000029345392, "raw_times": [0.21960500004070127, 0.22131500003297333, 0.2149849999568687, 0.2188750000300388, 0.25083500003120207], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22256500000139567, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21324499999764157, "p50": 0.21695399999543952, "p90": 0.22048499999982596, "mean": 0.21762459998626582, "iqr": 0.003631000026871334, "raw_times": [0.21685399997295463, 0.22058499996546743, 0.21695399999543952, 0.22048499999982596, 0.21324499999764157], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2328750000515356, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21507499997142077, "p50": 0.21520500001770415, "p90": 0.21626500000593296, "mean": 0.2183889999969324, "iqr": 0.001121000025250396, "raw_times": [0.21626500000593296, 0.21507499997142077, 0.21520500001770415, 0.23025600000892155, 0.21514399998068257], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22121499995364502, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2143149999938032, "p50": 0.21809500003655558, "p90": 0.21866499997713618, "mean": 0.2174867999997332, "iqr": 0.0025609999738662736, "raw_times": [0.2161040000032699, 0.2202549999879011, 0.21866499997713618, 0.2143149999938032, 0.21809500003655558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23317500000530345, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21576399996092732, "p50": 0.21857400003000294, "p90": 0.2222250000158965, "mean": 0.22089439999035676, "iqr": 0.004881000052137097, "raw_times": [0.21576399996092732, 0.23056499998119762, 0.21857400003000294, 0.2222250000158965, 0.21734399996375942], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22117399998933251, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.213884999993752, "p50": 0.21784400001934046, "p90": 0.21903500004327725, "mean": 0.2176128000087374, "iqr": 0.001270000041131425, "raw_times": [0.213884999993752, 0.21784400001934046, 0.21776500000214583, 0.21953499998517145, 0.21903500004327725], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22340499998563246, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21709399999281231, "p50": 0.2186049999863826, "p90": 0.21865499996920335, "mean": 0.22543699998323063, "iqr": 0.0004899999908047903, "raw_times": [0.21816499997839855, 0.25466599998935635, 0.21709399999281231, 0.2186049999863826, 0.21865499996920335], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22284399994987325, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2291549999995368, "p50": 0.23022499999569845, "p90": 0.2316450000421355, "mean": 0.234377000015229, "iqr": 0.0017800000478018774, "raw_times": [0.2291549999995368, 0.2316450000421355, 0.23022499999569845, 0.25099500004444053, 0.22986499999433363], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23142500003814348, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6352529999844592, "p50": 0.6405139999969833, "p90": 0.6429430000025604, "mean": 0.6394775999979174, "iqr": 0.007369000002199755, "raw_times": [0.6405139999969833, 0.6352529999844592, 0.6431040000052235, 0.6355740000003607, 0.6429430000025604], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6388340000285098, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}