From 8f589a4af351aa3f247f866eefaf336a50cdaebb Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Mon, 3 Jun 2024 05:38:23 -0700 Subject: [PATCH 1/9] update sam script --- benchmarks/benchmark_sam.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/benchmarks/benchmark_sam.py b/benchmarks/benchmark_sam.py index 14c2d8bc5e..5927b4f99e 100644 --- a/benchmarks/benchmark_sam.py +++ b/benchmarks/benchmark_sam.py @@ -1,6 +1,8 @@ +from itertools import product + import pandas as pd +from segment_anything_fast import sam_model_registry import torch -from segment_anything import sam_model_registry from torch.utils.benchmark import Timer from torch.sparse import SparseSemiStructuredTensor, SparseSemiStructuredTensorCUTLASS, SparseSemiStructuredTensorCUSPARSELT from torchao.quantization.quant_api import ( @@ -16,7 +18,6 @@ apply_fake_sparsity, ) from torchao.sparsity.prototype.dynamic_quant_sparse import Int8DynamicallyQuantized24CusparseltLinearFuseMulWeight, Int8DynamicallyQuantizedSemiStructuredSparseLinearWeight -from itertools import product from tqdm import tqdm sam_checkpoint_base_path = "/home/jessecai/local/MODELS" @@ -112,18 +113,20 @@ def run_once(block_only=False, dtype=torch.bfloat16, batchsize=32, compile=True, if __name__ == "__main__": print("BENCHMARKING") - ALL_RUNS = [run_once(qkv="quant+sparse (cutlass)", proj="quant", lin1="quant+sparse (cutlass)", lin2="quant+sparse (cutlass)")] - # for option in tqdm(SUBCLASSES)] - # ALL_RUNS = [ - # run_once(), - # run_once(qkv="quant", proj="quant", lin1="quant", lin2="quant"), - # run_once(qkv="quant+sparse (cusparselt)", proj="quant+sparse (cusparselt)", lin1="quant+sparse (cusparselt)", lin2="quant+sparse (cutlass)"), - # run_once(qkv="quant+sparse (cusparselt)", proj="quant", lin1="quant+sparse (cutlass)", lin2="quant+sparse (cutlass)"), - # run_once(qkv="quant", proj="quant", lin1="quant+sparse (cusparselt)", lin2="quant+sparse (cusparselt)"), - # run_once(qkv="sparse 
(cusparselt)", proj="sparse (cusparselt)", lin1="sparse (cusparselt)", lin2="sparse (cusparselt)"), - # run_once(qkv="sparse (cutlass)", proj="sparse (cutlass)", lin1="sparse (cutlass)", lin2="sparse (cutlass)"), - # run_once(qkv="quant+sparse (cutlass)", proj="quant+sparse (cutlass)", lin1="quant+sparse (cutlass)", lin2="quant+sparse (cutlass)"), - # ] + # ALL_RUNS = [run_once(qkv="quant+sparse (cutlass)", proj="quant", lin1="quant+sparse (cutlass)", lin2="quant+sparse (cutlass)")] + + ALL_RUNS = [ + run_once(), + run_once(lin1="sparse (cusparselt)", lin2="sparse (cusparselt)"), + run_once(lin1="sparse (cutlass)", lin2="sparse (cutlass)"), + run_once(qkv="sparse (cusparselt)", proj="sparse (cusparselt)", lin1="sparse (cusparselt)", lin2="sparse (cusparselt)"), + run_once(qkv="sparse (cutlass)", proj="sparse (cutlass)", lin1="sparse (cutlass)", lin2="sparse (cutlass)"), + # run_once(qkv="quant", proj="quant", lin1="quant", lin2="quant"), + # run_once(qkv="quant+sparse (cusparselt)", proj="quant+sparse (cusparselt)", lin1="quant+sparse (cusparselt)", lin2="quant+sparse (cutlass)"), + # run_once(qkv="quant+sparse (cusparselt)", proj="quant", lin1="quant+sparse (cutlass)", lin2="quant+sparse (cutlass)"), + # run_once(qkv="quant", proj="quant", lin1="quant+sparse (cusparselt)", lin2="quant+sparse (cusparselt)"), + # run_once(qkv="quant+sparse (cutlass)", proj="quant+sparse (cutlass)", lin1="quant+sparse (cutlass)", lin2="quant+sparse (cutlass)"), + ] df = pd.DataFrame(ALL_RUNS) df.to_csv("sam_benchmark_results.csv") print(df) From 243479c43623966a08c57f510befc81b3e21151a Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Mon, 3 Jun 2024 06:11:37 -0700 Subject: [PATCH 2/9] updated readme with results --- benchmarks/benchmark_sam.py | 15 +++++++++------ torchao/sparsity/README.md | 31 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/benchmarks/benchmark_sam.py b/benchmarks/benchmark_sam.py index 5927b4f99e..1677b24ee7 100644 --- 
a/benchmarks/benchmark_sam.py +++ b/benchmarks/benchmark_sam.py @@ -1,3 +1,4 @@ +import argparse from itertools import product import pandas as pd @@ -113,14 +114,16 @@ def run_once(block_only=False, dtype=torch.bfloat16, batchsize=32, compile=True, if __name__ == "__main__": print("BENCHMARKING") + parser = argparse.ArgumentParser(description='Process some integers.') + parser.add_argument('--eager', action='store_true', help='enable/disable torch.compile') + args = parser.parse_args() # ALL_RUNS = [run_once(qkv="quant+sparse (cutlass)", proj="quant", lin1="quant+sparse (cutlass)", lin2="quant+sparse (cutlass)")] - ALL_RUNS = [ - run_once(), - run_once(lin1="sparse (cusparselt)", lin2="sparse (cusparselt)"), - run_once(lin1="sparse (cutlass)", lin2="sparse (cutlass)"), - run_once(qkv="sparse (cusparselt)", proj="sparse (cusparselt)", lin1="sparse (cusparselt)", lin2="sparse (cusparselt)"), - run_once(qkv="sparse (cutlass)", proj="sparse (cutlass)", lin1="sparse (cutlass)", lin2="sparse (cutlass)"), + run_once(compile=not args.eager), + run_once(compile=not args.eager, lin1="sparse (cusparselt)", lin2="sparse (cusparselt)"), + run_once(compile=not args.eager, lin1="sparse (cutlass)", lin2="sparse (cutlass)"), + run_once(compile=not args.eager, qkv="sparse (cusparselt)", proj="sparse (cusparselt)", lin1="sparse (cusparselt)", lin2="sparse (cusparselt)"), + run_once(compile=not args.eager, qkv="sparse (cutlass)", proj="sparse (cutlass)", lin1="sparse (cutlass)", lin2="sparse (cutlass)"), # run_once(qkv="quant", proj="quant", lin1="quant", lin2="quant"), # run_once(qkv="quant+sparse (cusparselt)", proj="quant+sparse (cusparselt)", lin1="quant+sparse (cusparselt)", lin2="quant+sparse (cutlass)"), # run_once(qkv="quant+sparse (cusparselt)", proj="quant", lin1="quant+sparse (cutlass)", lin2="quant+sparse (cutlass)"), diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md index 49cbe51a13..8d6a10dbb4 100644 --- a/torchao/sparsity/README.md +++ 
b/torchao/sparsity/README.md @@ -18,6 +18,37 @@ More concretely, we hope to provide tutorials and APIs for both sparse kernels ( 2. Recover accuracy loss of pruned model with custom pruning algorthim. 3. Accelerate masked/pruned models on sparsity-supported hardware to realize performance improvements. +## Success Stories + +#### segment-anything +We applied 2:4 sparsity to accelerate segment-anything, as part of [segment-anything-fast](https://github.com/pytorch-labs/segment-anything-fast). +The results mentioned in the REAADME of the repo compose sparsity with a suite of other inference acceleration techniques. + +From our benchmarking, we see a 1.1x speedup when running with SEGMENT_ANYTHING_FAST_USE_FLASH_4 enabled. + +``` +python benchmarks/benchmark_sam.py + + block_only batchsize dtype compile qkv proj lin1 lin2 time memory img/s +0 False 32 torch.bfloat16 True None None None None 1361.733349 15.808660 23.499461 +1 False 32 torch.bfloat16 True None None sparse (cusparselt) sparse (cusparselt) 1245.151100 15.462827 25.699692 +2 False 32 torch.bfloat16 True None None sparse (cutlass) sparse (cutlass) 1251.047651 15.411250 25.578562 +3 False 32 torch.bfloat16 True sparse (cusparselt) sparse (cusparselt) sparse (cusparselt) sparse (cusparselt) 1265.426255 12.705007 25.287922 +4 False 32 torch.bfloat16 True sparse (cutlass) sparse (cutlass) sparse (cutlass) sparse (cutlass) 1274.955840 12.704523 25.098909 +``` + +#### BERT + +We were able to accelerate BERT 1.23x with a negligible accuracy drop on SQuAD. +For more information about accelerting BERT with semi-sturcutred sparsity, please see our [tutorial](https://pytorch.org/tutorials/advanced/semi_structured_sparse.html?highlight=beta). 
+ +| Metrics | fp16 | 2:4 sparse | delta / speedup | +| --- | --- | --- | --- | --- | +| Exact Match (%) | 78.53 | 78.44 | -0.09 | +| F1 (%) | 86.93 | 86.49 | -0.44 | +| Time (bs=16) | 19.35 | 15.74 | 1.23x | + + # Design Sparsity, like quantization, is an accuracy/performance trade-off, where we care not only about the speedup but also on the accuracy degradation of our architecture optimization technique. From 94c2ecbe3efe277c29a23629cf5cba13cbf1297e Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Mon, 3 Jun 2024 06:13:45 -0700 Subject: [PATCH 3/9] fix formatting --- torchao/sparsity/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md index 8d6a10dbb4..1b9a0af527 100644 --- a/torchao/sparsity/README.md +++ b/torchao/sparsity/README.md @@ -43,7 +43,7 @@ We were able to accelerate BERT 1.23x with a negligible accuracy drop on SQuAD. For more information about accelerting BERT with semi-sturcutred sparsity, please see our [tutorial](https://pytorch.org/tutorials/advanced/semi_structured_sparse.html?highlight=beta). 
| Metrics | fp16 | 2:4 sparse | delta / speedup | -| --- | --- | --- | --- | --- | +| --- | --- | --- | --- | | Exact Match (%) | 78.53 | 78.44 | -0.09 | | F1 (%) | 86.93 | 86.49 | -0.44 | | Time (bs=16) | 19.35 | 15.74 | 1.23x | From a530f78e93166bc9600bf6df05d652362b640cd1 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Mon, 3 Jun 2024 07:51:16 -0700 Subject: [PATCH 4/9] cr feedback --- benchmarks/benchmark_sam.py | 2 ++ torchao/sparsity/README.md | 22 ++++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/benchmarks/benchmark_sam.py b/benchmarks/benchmark_sam.py index 1677b24ee7..3a5d817602 100644 --- a/benchmarks/benchmark_sam.py +++ b/benchmarks/benchmark_sam.py @@ -2,6 +2,8 @@ from itertools import product import pandas as pd +# to install segment-anything-fast you can run: +# pip install git+https://github.com/pytorch-labs/segment-anything-fast.git from segment_anything_fast import sam_model_registry import torch from torch.utils.benchmark import Timer diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md index 1b9a0af527..ce8724320e 100644 --- a/torchao/sparsity/README.md +++ b/torchao/sparsity/README.md @@ -22,24 +22,26 @@ More concretely, we hope to provide tutorials and APIs for both sparse kernels ( #### segment-anything We applied 2:4 sparsity to accelerate segment-anything, as part of [segment-anything-fast](https://github.com/pytorch-labs/segment-anything-fast). -The results mentioned in the REAADME of the repo compose sparsity with a suite of other inference acceleration techniques. +The results mentioned in the README of the repo compose sparsity with a suite of other inference acceleration techniques. -From our benchmarking, we see a 1.1x speedup when running with SEGMENT_ANYTHING_FAST_USE_FLASH_4 enabled. +From our [benchmarking](https://github.com/pytorch/ao/blob/main/benchmarks/benchmark_sam.py), we see a 1.1x speedup when running with SEGMENT_ANYTHING_FAST_USE_FLASH_4 enabled. 
+To reproduce these benchmarks you can run the following command: ``` python benchmarks/benchmark_sam.py - - block_only batchsize dtype compile qkv proj lin1 lin2 time memory img/s -0 False 32 torch.bfloat16 True None None None None 1361.733349 15.808660 23.499461 -1 False 32 torch.bfloat16 True None None sparse (cusparselt) sparse (cusparselt) 1245.151100 15.462827 25.699692 -2 False 32 torch.bfloat16 True None None sparse (cutlass) sparse (cutlass) 1251.047651 15.411250 25.578562 -3 False 32 torch.bfloat16 True sparse (cusparselt) sparse (cusparselt) sparse (cusparselt) sparse (cusparselt) 1265.426255 12.705007 25.287922 -4 False 32 torch.bfloat16 True sparse (cutlass) sparse (cutlass) sparse (cutlass) sparse (cutlass) 1274.955840 12.704523 25.098909 ``` +| block_only | batchsize | dtype | compile | qkv | proj | lin1 | lin2 | time | memory | img/s | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| False | 32 | torch.bfloat16 | True | None | None | None | None | 1361.733349 | 15.808660 | 23.499461 | +| False | 32 | torch.bfloat16 | True | None | None | sparse (cusparselt) | sparse (cusparselt) | 1245.151100 | 15.462827 | 25.699692 | +| False | 32 | torch.bfloat16 | True | None | None | sparse (cutlass) | sparse (cutlass) | 1251.047651 | 15.411250 | 25.578562 | +| False | 32 | torch.bfloat16 | True | sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | 1265.426255 | 12.705007 | 25.287922 | +| False | 32 | torch.bfloat16 | True | sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | 1274.955840 | 12.704523 | 25.098909 | + #### BERT -We were able to accelerate BERT 1.23x with a negligible accuracy drop on SQuAD. +We were able to accelerate BERT 1.23x on an A100 with a negligible accuracy drop on SQuAD. 
For more information about accelerting BERT with semi-sturcutred sparsity, please see our [tutorial](https://pytorch.org/tutorials/advanced/semi_structured_sparse.html?highlight=beta). | Metrics | fp16 | 2:4 sparse | delta / speedup | From e555fca5054275167838c20f9aed0bc4ab15c0d2 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Mon, 3 Jun 2024 07:54:40 -0700 Subject: [PATCH 5/9] more cr feedback --- torchao/sparsity/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md index ce8724320e..7cef6ffc6c 100644 --- a/torchao/sparsity/README.md +++ b/torchao/sparsity/README.md @@ -27,12 +27,15 @@ The results mentioned in the README of the repo compose sparsity with a suite of From our [benchmarking](https://github.com/pytorch/ao/blob/main/benchmarks/benchmark_sam.py), we see a 1.1x speedup when running with SEGMENT_ANYTHING_FAST_USE_FLASH_4 enabled. To reproduce these benchmarks you can run the following command: +The inference acceleration of semi-structured sparsity depends on the matmul shapes, which is why we don't see additional speedups when applying to all linear layers (attn + mlp) of segment-anything. +We find that accelerating the MLP linear layers provided the most speedups (`lin1`, `lin2`).
To reproduce our benchmarks you can run the following command: + ``` python benchmarks/benchmark_sam.py ``` | block_only | batchsize | dtype | compile | qkv | proj | lin1 | lin2 | time | memory | img/s | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| ---------- | --------- | ----- | ------- | --- | ---- | ---- | ---- | ---- | ------ | ----- | | False | 32 | torch.bfloat16 | True | None | None | None | None | 1361.733349 | 15.808660 | 23.499461 | | False | 32 | torch.bfloat16 | True | None | None | sparse (cusparselt) | sparse (cusparselt) | 1245.151100 | 15.462827 | 25.699692 | | False | 32 | torch.bfloat16 | True | None | None | sparse (cutlass) | sparse (cutlass) | 1251.047651 | 15.411250 | 25.578562 | From 6df54564f570d1d14b9aa3867118246b9fb26ea6 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Mon, 3 Jun 2024 07:56:58 -0700 Subject: [PATCH 6/9] more cr feedback --- torchao/sparsity/README.md | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md index 7cef6ffc6c..980632e359 100644 --- a/torchao/sparsity/README.md +++ b/torchao/sparsity/README.md @@ -34,13 +34,15 @@ We find that accelerating the MLP linear layers provied the most speedups (`lin1 python benchmarks/benchmark_sam.py ``` -| block_only | batchsize | dtype | compile | qkv | proj | lin1 | lin2 | time | memory | img/s | -| ---------- | --------- | ----- | ------- | --- | ---- | ---- | ---- | ---- | ------ | ----- | -| False | 32 | torch.bfloat16 | True | None | None | None | None | 1361.733349 | 15.808660 | 23.499461 | -| False | 32 | torch.bfloat16 | True | None | None | sparse (cusparselt) | sparse (cusparselt) | 1245.151100 | 15.462827 | 25.699692 | -| False | 32 | torch.bfloat16 | True | None | None | sparse (cutlass) | sparse (cutlass) | 1251.047651 | 15.411250 | 25.578562 | -| False | 32 | torch.bfloat16 | True | sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | sparse
(cusparselt) | 1265.426255 | 12.705007 | 25.287922 | -| False | 32 | torch.bfloat16 | True | sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | 1274.955840 | 12.704523 | 25.098909 | +The following benchmarks were run on an A100, with batch_size=32 and `bfloat16` dtype: + +| qkv | proj | lin1 | lin2 | time | memory | img/s | +| ---- | ---- | ---- | ---- | ------ | ----- | +| None | None | None | None | 1361.733349 | 15.808660 | 23.499461 | +| None | None | sparse (cusparselt) | sparse (cusparselt) | 1245.151100 | 15.462827 | 25.699692 | +| None | None | sparse (cutlass) | sparse (cutlass) | 1251.047651 | 15.411250 | 25.578562 | +| sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | 1265.426255 | 12.705007 | 25.287922 | +| sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | 1274.955840 | 12.704523 | 25.098909 | #### BERT From de207619e244ec0a7b8b8c4023cbca66a78a1638 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Mon, 3 Jun 2024 07:57:56 -0700 Subject: [PATCH 7/9] more cr feedback --- torchao/sparsity/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md index 980632e359..238378b98b 100644 --- a/torchao/sparsity/README.md +++ b/torchao/sparsity/README.md @@ -36,8 +36,8 @@ python benchmarks/benchmark_sam.py The following benchmarks were run on an A100, with batch_size=32 and `bfloat16` dtype: -| qkv | proj | lin1 | lin2 | time | memory | img/s | -| ---- | ---- | ---- | ---- | ------ | ----- | +| qkv | proj | lin1 | lin2 | time | memory | img/s | +| ---- | ---- | ---- | ---- | ---- | ------ | ----- | | None | None | None | None | 1361.733349 | 15.808660 | 23.499461 | | None | None | sparse (cusparselt) | sparse (cusparselt) | 1245.151100 | 15.462827 | 25.699692 | | None | None | sparse (cutlass) | sparse (cutlass) | 1251.047651 | 15.411250 | 25.578562 | From 1f265c4bd78f67816b472fb7a799898f144824dd Mon Sep 17
00:00:00 2001 From: Jesse Cai Date: Mon, 3 Jun 2024 07:58:33 -0700 Subject: [PATCH 8/9] add code ticks --- torchao/sparsity/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md index 238378b98b..fc116e0a2c 100644 --- a/torchao/sparsity/README.md +++ b/torchao/sparsity/README.md @@ -24,7 +24,7 @@ More concretely, we hope to provide tutorials and APIs for both sparse kernels ( We applied 2:4 sparsity to accelerate segment-anything, as part of [segment-anything-fast](https://github.com/pytorch-labs/segment-anything-fast). The results mentioned in the README of the repo compose sparsity with a suite of other inference acceleration techniques. -From our [benchmarking](https://github.com/pytorch/ao/blob/main/benchmarks/benchmark_sam.py), we see a 1.1x speedup when running with SEGMENT_ANYTHING_FAST_USE_FLASH_4 enabled. +From our [benchmarking](https://github.com/pytorch/ao/blob/main/benchmarks/benchmark_sam.py), we see a 1.1x speedup when running with `SEGMENT_ANYTHING_FAST_USE_FLASH_4` enabled. To reproduce these benchmarks you can run the following command: The inference acceleration of semi-structured sparsity depends on the matmul shapes, which is why we don't see additional speedups when applying to all linear layers (attn + mlp) of segment-anything. 
From 1fbea59e1dd5fa7a914a2a941ef1d9c4ab811507 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Mon, 3 Jun 2024 08:00:07 -0700 Subject: [PATCH 9/9] sigfigs --- torchao/sparsity/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md index fc116e0a2c..bc0a61b202 100644 --- a/torchao/sparsity/README.md +++ b/torchao/sparsity/README.md @@ -38,11 +38,11 @@ The following benchmarks we run on an A100, with batch_size=32 and `bfloat16` dt | qkv | proj | lin1 | lin2 | time | memory | img/s | | ---- | ---- | ---- | ---- | ---- | ------ | ----- | -| None | None | None | None | 1361.733349 | 15.808660 | 23.499461 | -| None | None | sparse (cusparselt) | sparse (cusparselt) | 1245.151100 | 15.462827 | 25.699692 | -| None | None | sparse (cutlass) | sparse (cutlass) | 1251.047651 | 15.411250 | 25.578562 | -| sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | 1265.426255 | 12.705007 | 25.287922 | -| sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | 1274.955840 | 12.704523 | 25.098909 | +| None | None | None | None | 1361.73 | 15.81 | 23.50 | +| None | None | sparse (cusparselt) | sparse (cusparselt) | 1245.15 | 15.46 | 25.70 | +| None | None | sparse (cutlass) | sparse (cutlass) | 1251.05 | 15.41 | 25.58 | +| sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | 1265.43 | 12.71 | 25.29 | +| sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | 1274.96 | 12.70 | 25.10 | #### BERT