From 8f589a4af351aa3f247f866eefaf336a50cdaebb Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Mon, 3 Jun 2024 05:38:23 -0700
Subject: [PATCH 1/9] update sam script

---
 benchmarks/benchmark_sam.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/benchmarks/benchmark_sam.py b/benchmarks/benchmark_sam.py
index 14c2d8bc5e..5927b4f99e 100644
--- a/benchmarks/benchmark_sam.py
+++ b/benchmarks/benchmark_sam.py
@@ -1,6 +1,8 @@
+from itertools import product
+
 import pandas as pd
+from segment_anything_fast import sam_model_registry
 import torch
-from segment_anything import sam_model_registry
 from torch.utils.benchmark import Timer
 from torch.sparse import SparseSemiStructuredTensor, SparseSemiStructuredTensorCUTLASS, SparseSemiStructuredTensorCUSPARSELT
 from torchao.quantization.quant_api import (
@@ -16,7 +18,6 @@
     apply_fake_sparsity,
 )
 from torchao.sparsity.prototype.dynamic_quant_sparse import Int8DynamicallyQuantized24CusparseltLinearFuseMulWeight, Int8DynamicallyQuantizedSemiStructuredSparseLinearWeight
-from itertools import product
 from tqdm import tqdm
 
 sam_checkpoint_base_path = "/home/jessecai/local/MODELS"
@@ -112,18 +113,20 @@ def run_once(block_only=False, dtype=torch.bfloat16, batchsize=32, compile=True,
 
 if __name__ == "__main__":
     print("BENCHMARKING")
-    ALL_RUNS = [run_once(qkv="quant+sparse (cutlass)", proj="quant", lin1="quant+sparse (cutlass)", lin2="quant+sparse (cutlass)")]
-                # for option in tqdm(SUBCLASSES)]
-    # ALL_RUNS = [
-    #     run_once(),
-    #     run_once(qkv="quant",                     proj="quant",                     lin1="quant",                        lin2="quant"),
-    #     run_once(qkv="quant+sparse (cusparselt)", proj="quant+sparse (cusparselt)", lin1="quant+sparse (cusparselt)",    lin2="quant+sparse (cutlass)"),
-    #     run_once(qkv="quant+sparse (cusparselt)", proj="quant",                     lin1="quant+sparse (cutlass)",       lin2="quant+sparse (cutlass)"),
-    #     run_once(qkv="quant",                     proj="quant",                     lin1="quant+sparse (cusparselt)",    lin2="quant+sparse (cusparselt)"),
-    #     run_once(qkv="sparse (cusparselt)",       proj="sparse (cusparselt)",       lin1="sparse (cusparselt)",          lin2="sparse (cusparselt)"),
-    #     run_once(qkv="sparse (cutlass)",          proj="sparse (cutlass)",          lin1="sparse (cutlass)",             lin2="sparse (cutlass)"),
-    #     run_once(qkv="quant+sparse (cutlass)",    proj="quant+sparse (cutlass)",    lin1="quant+sparse (cutlass)",       lin2="quant+sparse (cutlass)"),
-    # ]
+    # ALL_RUNS = [run_once(qkv="quant+sparse (cutlass)", proj="quant", lin1="quant+sparse (cutlass)", lin2="quant+sparse (cutlass)")]
+
+    ALL_RUNS = [
+        run_once(),
+        run_once(lin1="sparse (cusparselt)", lin2="sparse (cusparselt)"),
+        run_once(lin1="sparse (cutlass)", lin2="sparse (cutlass)"),
+        run_once(qkv="sparse (cusparselt)",       proj="sparse (cusparselt)",       lin1="sparse (cusparselt)",          lin2="sparse (cusparselt)"),
+        run_once(qkv="sparse (cutlass)",          proj="sparse (cutlass)",          lin1="sparse (cutlass)",             lin2="sparse (cutlass)"),
+        # run_once(qkv="quant",                     proj="quant",                     lin1="quant",                        lin2="quant"),
+        # run_once(qkv="quant+sparse (cusparselt)", proj="quant+sparse (cusparselt)", lin1="quant+sparse (cusparselt)",    lin2="quant+sparse (cutlass)"),
+        # run_once(qkv="quant+sparse (cusparselt)", proj="quant",                     lin1="quant+sparse (cutlass)",       lin2="quant+sparse (cutlass)"),
+        # run_once(qkv="quant",                     proj="quant",                     lin1="quant+sparse (cusparselt)",    lin2="quant+sparse (cusparselt)"),
+        # run_once(qkv="quant+sparse (cutlass)",    proj="quant+sparse (cutlass)",    lin1="quant+sparse (cutlass)",       lin2="quant+sparse (cutlass)"),
+    ]
     df = pd.DataFrame(ALL_RUNS)
     df.to_csv("sam_benchmark_results.csv")
     print(df)

From 243479c43623966a08c57f510befc81b3e21151a Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Mon, 3 Jun 2024 06:11:37 -0700
Subject: [PATCH 2/9] updated readme with results

---
 benchmarks/benchmark_sam.py | 15 +++++++++------
 torchao/sparsity/README.md  | 31 +++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/benchmarks/benchmark_sam.py b/benchmarks/benchmark_sam.py
index 5927b4f99e..1677b24ee7 100644
--- a/benchmarks/benchmark_sam.py
+++ b/benchmarks/benchmark_sam.py
@@ -1,3 +1,4 @@
+import argparse
 from itertools import product
 
 import pandas as pd
@@ -113,14 +114,16 @@ def run_once(block_only=False, dtype=torch.bfloat16, batchsize=32, compile=True,
 
 if __name__ == "__main__":
     print("BENCHMARKING")
+    parser = argparse.ArgumentParser(description='Process some integers.')
+    parser.add_argument('--eager', action='store_true', help='enable/disable torch.compile')
+    args = parser.parse_args()
     # ALL_RUNS = [run_once(qkv="quant+sparse (cutlass)", proj="quant", lin1="quant+sparse (cutlass)", lin2="quant+sparse (cutlass)")]
-
     ALL_RUNS = [
-        run_once(),
-        run_once(lin1="sparse (cusparselt)", lin2="sparse (cusparselt)"),
-        run_once(lin1="sparse (cutlass)", lin2="sparse (cutlass)"),
-        run_once(qkv="sparse (cusparselt)",       proj="sparse (cusparselt)",       lin1="sparse (cusparselt)",          lin2="sparse (cusparselt)"),
-        run_once(qkv="sparse (cutlass)",          proj="sparse (cutlass)",          lin1="sparse (cutlass)",             lin2="sparse (cutlass)"),
+        run_once(compile=not args.eager),
+        run_once(compile=not args.eager, lin1="sparse (cusparselt)", lin2="sparse (cusparselt)"),
+        run_once(compile=not args.eager, lin1="sparse (cutlass)", lin2="sparse (cutlass)"),
+        run_once(compile=not args.eager, qkv="sparse (cusparselt)",       proj="sparse (cusparselt)",       lin1="sparse (cusparselt)",          lin2="sparse (cusparselt)"),
+        run_once(compile=not args.eager, qkv="sparse (cutlass)",          proj="sparse (cutlass)",          lin1="sparse (cutlass)",             lin2="sparse (cutlass)"),
         # run_once(qkv="quant",                     proj="quant",                     lin1="quant",                        lin2="quant"),
         # run_once(qkv="quant+sparse (cusparselt)", proj="quant+sparse (cusparselt)", lin1="quant+sparse (cusparselt)",    lin2="quant+sparse (cutlass)"),
         # run_once(qkv="quant+sparse (cusparselt)", proj="quant",                     lin1="quant+sparse (cutlass)",       lin2="quant+sparse (cutlass)"),
diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md
index 49cbe51a13..8d6a10dbb4 100644
--- a/torchao/sparsity/README.md
+++ b/torchao/sparsity/README.md
@@ -18,6 +18,37 @@ More concretely, we hope to provide tutorials and APIs for both sparse kernels (
 2. Recover accuracy loss of pruned model with custom pruning algorthim.
 3. Accelerate masked/pruned models on sparsity-supported hardware to realize performance improvements.
 
+## Success Stories
+
+#### segment-anything
+We applied 2:4 sparsity to accelerate segment-anything, as part of [segment-anything-fast](https://github.com/pytorch-labs/segment-anything-fast).
+The results mentioned in the REAADME of the repo compose sparsity with a suite of other inference acceleration techniques.
+
+From our benchmarking, we see a 1.1x speedup when running with SEGMENT_ANYTHING_FAST_USE_FLASH_4 enabled.
+
+```
+python benchmarks/benchmark_sam.py
+
+   block_only  batchsize           dtype  compile                  qkv                 proj                 lin1                 lin2         time     memory      img/s
+0       False         32  torch.bfloat16     True                 None                 None                 None                 None  1361.733349  15.808660  23.499461
+1       False         32  torch.bfloat16     True                 None                 None  sparse (cusparselt)  sparse (cusparselt)  1245.151100  15.462827  25.699692
+2       False         32  torch.bfloat16     True                 None                 None     sparse (cutlass)     sparse (cutlass)  1251.047651  15.411250  25.578562
+3       False         32  torch.bfloat16     True  sparse (cusparselt)  sparse (cusparselt)  sparse (cusparselt)  sparse (cusparselt)  1265.426255  12.705007  25.287922
+4       False         32  torch.bfloat16     True     sparse (cutlass)     sparse (cutlass)     sparse (cutlass)     sparse (cutlass)  1274.955840  12.704523  25.098909
+```
+
+#### BERT
+
+We were able to accelerate BERT 1.23x with a negligible accuracy drop on SQuAD.
+For more information about accelerting BERT with semi-sturcutred sparsity, please see our [tutorial](https://pytorch.org/tutorials/advanced/semi_structured_sparse.html?highlight=beta).
+
+| Metrics | fp16 | 2:4 sparse | delta / speedup |
+| --- | --- | --- | --- | --- |
+| Exact Match (%) | 78.53 | 78.44 | -0.09 |
+| F1 (%) | 86.93 | 86.49 | -0.44 |
+| Time (bs=16) | 19.35 | 15.74 | 1.23x |
+
+
 # Design
 
 Sparsity, like quantization, is an accuracy/performance trade-off, where we care not only about the speedup but also on the accuracy degradation of our architecture optimization technique.

From 94c2ecbe3efe277c29a23629cf5cba13cbf1297e Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Mon, 3 Jun 2024 06:13:45 -0700
Subject: [PATCH 3/9] fix formatting

---
 torchao/sparsity/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md
index 8d6a10dbb4..1b9a0af527 100644
--- a/torchao/sparsity/README.md
+++ b/torchao/sparsity/README.md
@@ -43,7 +43,7 @@ We were able to accelerate BERT 1.23x with a negligible accuracy drop on SQuAD.
 For more information about accelerting BERT with semi-sturcutred sparsity, please see our [tutorial](https://pytorch.org/tutorials/advanced/semi_structured_sparse.html?highlight=beta).
 
 | Metrics | fp16 | 2:4 sparse | delta / speedup |
-| --- | --- | --- | --- | --- |
+| --- | --- | --- | --- |
 | Exact Match (%) | 78.53 | 78.44 | -0.09 |
 | F1 (%) | 86.93 | 86.49 | -0.44 |
 | Time (bs=16) | 19.35 | 15.74 | 1.23x |

From a530f78e93166bc9600bf6df05d652362b640cd1 Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Mon, 3 Jun 2024 07:51:16 -0700
Subject: [PATCH 4/9] cr feedback

---
 benchmarks/benchmark_sam.py |  2 ++
 torchao/sparsity/README.md  | 22 ++++++++++++----------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/benchmarks/benchmark_sam.py b/benchmarks/benchmark_sam.py
index 1677b24ee7..3a5d817602 100644
--- a/benchmarks/benchmark_sam.py
+++ b/benchmarks/benchmark_sam.py
@@ -2,6 +2,8 @@
 from itertools import product
 
 import pandas as pd
+# to install segment-anything-fast you can run:
+# pip install git+https://github.com/pytorch-labs/segment-anything-fast.git
 from segment_anything_fast import sam_model_registry
 import torch
 from torch.utils.benchmark import Timer
diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md
index 1b9a0af527..ce8724320e 100644
--- a/torchao/sparsity/README.md
+++ b/torchao/sparsity/README.md
@@ -22,24 +22,26 @@ More concretely, we hope to provide tutorials and APIs for both sparse kernels (
 
 #### segment-anything
 We applied 2:4 sparsity to accelerate segment-anything, as part of [segment-anything-fast](https://github.com/pytorch-labs/segment-anything-fast).
-The results mentioned in the REAADME of the repo compose sparsity with a suite of other inference acceleration techniques.
+The results mentioned in the README of the repo compose sparsity with a suite of other inference acceleration techniques.
 
-From our benchmarking, we see a 1.1x speedup when running with SEGMENT_ANYTHING_FAST_USE_FLASH_4 enabled.
+From our [benchmarking](https://github.com/pytorch/ao/blob/main/benchmarks/benchmark_sam.py), we see a 1.1x speedup when running with SEGMENT_ANYTHING_FAST_USE_FLASH_4 enabled.
+To reproduce these benchmarks you can run the following command:
 
 ```
 python benchmarks/benchmark_sam.py
-
-   block_only  batchsize           dtype  compile                  qkv                 proj                 lin1                 lin2         time     memory      img/s
-0       False         32  torch.bfloat16     True                 None                 None                 None                 None  1361.733349  15.808660  23.499461
-1       False         32  torch.bfloat16     True                 None                 None  sparse (cusparselt)  sparse (cusparselt)  1245.151100  15.462827  25.699692
-2       False         32  torch.bfloat16     True                 None                 None     sparse (cutlass)     sparse (cutlass)  1251.047651  15.411250  25.578562
-3       False         32  torch.bfloat16     True  sparse (cusparselt)  sparse (cusparselt)  sparse (cusparselt)  sparse (cusparselt)  1265.426255  12.705007  25.287922
-4       False         32  torch.bfloat16     True     sparse (cutlass)     sparse (cutlass)     sparse (cutlass)     sparse (cutlass)  1274.955840  12.704523  25.098909
 ```
 
+| block_only | batchsize | dtype | compile | qkv | proj | lin1 | lin2 | time | memory | img/s |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| False | 32 | torch.bfloat16 | True | None | None | None | None | 1361.733349 | 15.808660 | 23.499461 |
+| False | 32 | torch.bfloat16 | True | None | None | sparse (cusparselt) | sparse (cusparselt) | 1245.151100 | 15.462827 | 25.699692 |
+| False | 32 | torch.bfloat16 | True | None | None | sparse (cutlass) | sparse (cutlass) | 1251.047651 | 15.411250 | 25.578562 |
+| False | 32 | torch.bfloat16 | True | sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | 1265.426255 | 12.705007 | 25.287922 |
+| False | 32 | torch.bfloat16 | True | sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | 1274.955840 | 12.704523 | 25.098909 |
+
 #### BERT
 
-We were able to accelerate BERT 1.23x with a negligible accuracy drop on SQuAD.
+We were able to accelerate BERT 1.23x on an A100 with a negligible accuracy drop on SQuAD.
 For more information about accelerting BERT with semi-sturcutred sparsity, please see our [tutorial](https://pytorch.org/tutorials/advanced/semi_structured_sparse.html?highlight=beta).
 
 | Metrics | fp16 | 2:4 sparse | delta / speedup |

From e555fca5054275167838c20f9aed0bc4ab15c0d2 Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Mon, 3 Jun 2024 07:54:40 -0700
Subject: [PATCH 5/9] more cr feedback

---
 torchao/sparsity/README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md
index ce8724320e..7cef6ffc6c 100644
--- a/torchao/sparsity/README.md
+++ b/torchao/sparsity/README.md
@@ -27,12 +27,15 @@ The results mentioned in the README of the repo compose sparsity with a suite of
 From our [benchmarking](https://github.com/pytorch/ao/blob/main/benchmarks/benchmark_sam.py), we see a 1.1x speedup when running with SEGMENT_ANYTHING_FAST_USE_FLASH_4 enabled.
 To reproduce these benchmarks you can run the following command:
 
+The inference acceleration of semi-structured sparsity depends on the matmul shapes, which is why we don't see additional speedups when applying to all linear layers (attn + mlp) of segment-anything.
+We find that accelerating the MLP linear layers provided the most speedups (`lin1`, `lin2`). To reproduce our benchmarks you can run the following command:
+
 ```
 python benchmarks/benchmark_sam.py
 ```
 
 | block_only | batchsize | dtype | compile | qkv | proj | lin1 | lin2 | time | memory | img/s |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| ---------- | --------- | ----- | ------- | --- | ---- | ---- | ---- | ---- | ------ | ----- |
 | False | 32 | torch.bfloat16 | True | None | None | None | None | 1361.733349 | 15.808660 | 23.499461 |
 | False | 32 | torch.bfloat16 | True | None | None | sparse (cusparselt) | sparse (cusparselt) | 1245.151100 | 15.462827 | 25.699692 |
 | False | 32 | torch.bfloat16 | True | None | None | sparse (cutlass) | sparse (cutlass) | 1251.047651 | 15.411250 | 25.578562 |

From 6df54564f570d1d14b9aa3867118246b9fb26ea6 Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Mon, 3 Jun 2024 07:56:58 -0700
Subject: [PATCH 6/9] more cr feedback

---
 torchao/sparsity/README.md | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md
index 7cef6ffc6c..980632e359 100644
--- a/torchao/sparsity/README.md
+++ b/torchao/sparsity/README.md
@@ -34,13 +34,15 @@ We find that accelerating the MLP linear layers provied the most speedups (`lin1
 python benchmarks/benchmark_sam.py
 ```
 
-| block_only | batchsize | dtype | compile | qkv | proj | lin1 | lin2 | time | memory | img/s |
-| ---------- | --------- | ----- | ------- | --- | ---- | ---- | ---- | ---- | ------ | ----- |
-| False | 32 | torch.bfloat16 | True | None | None | None | None | 1361.733349 | 15.808660 | 23.499461 |
-| False | 32 | torch.bfloat16 | True | None | None | sparse (cusparselt) | sparse (cusparselt) | 1245.151100 | 15.462827 | 25.699692 |
-| False | 32 | torch.bfloat16 | True | None | None | sparse (cutlass) | sparse (cutlass) | 1251.047651 | 15.411250 | 25.578562 |
-| False | 32 | torch.bfloat16 | True | sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | 1265.426255 | 12.705007 | 25.287922 |
-| False | 32 | torch.bfloat16 | True | sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | 1274.955840 | 12.704523 | 25.098909 |
+The following benchmarks we run on an A100, with batch_size=32 and `bfloat16` dtype:
+
+| qkv | proj | lin1 | lin2 | time | memory | img/s |
+| ---- | ---- | ---- | ---- | ------ | ----- |
+| None | None | None | None | 1361.733349 | 15.808660 | 23.499461 |
+| None | None | sparse (cusparselt) | sparse (cusparselt) | 1245.151100 | 15.462827 | 25.699692 |
+| None | None | sparse (cutlass) | sparse (cutlass) | 1251.047651 | 15.411250 | 25.578562 |
+| sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | 1265.426255 | 12.705007 | 25.287922 |
+| sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | 1274.955840 | 12.704523 | 25.098909 |
 
 #### BERT
 

From de207619e244ec0a7b8b8c4023cbca66a78a1638 Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Mon, 3 Jun 2024 07:57:56 -0700
Subject: [PATCH 7/9] more cr feedback

---
 torchao/sparsity/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md
index 980632e359..238378b98b 100644
--- a/torchao/sparsity/README.md
+++ b/torchao/sparsity/README.md
@@ -36,8 +36,8 @@ python benchmarks/benchmark_sam.py
 
 The following benchmarks we run on an A100, with batch_size=32 and `bfloat16` dtype:
 
-| qkv | proj | lin1 | lin2 | time | memory | img/s |
-| ---- | ---- | ---- | ---- | ------ | ----- |
+| qkv  | proj | lin1 | lin2 | time | memory | img/s |
+| ---- | ---- | ---- | ---- | ---- | ------ | ----- |
 | None | None | None | None | 1361.733349 | 15.808660 | 23.499461 |
 | None | None | sparse (cusparselt) | sparse (cusparselt) | 1245.151100 | 15.462827 | 25.699692 |
 | None | None | sparse (cutlass) | sparse (cutlass) | 1251.047651 | 15.411250 | 25.578562 |

From 1f265c4bd78f67816b472fb7a799898f144824dd Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Mon, 3 Jun 2024 07:58:33 -0700
Subject: [PATCH 8/9] add code ticks

---
 torchao/sparsity/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md
index 238378b98b..fc116e0a2c 100644
--- a/torchao/sparsity/README.md
+++ b/torchao/sparsity/README.md
@@ -24,7 +24,7 @@ More concretely, we hope to provide tutorials and APIs for both sparse kernels (
 We applied 2:4 sparsity to accelerate segment-anything, as part of [segment-anything-fast](https://github.com/pytorch-labs/segment-anything-fast).
 The results mentioned in the README of the repo compose sparsity with a suite of other inference acceleration techniques.
 
-From our [benchmarking](https://github.com/pytorch/ao/blob/main/benchmarks/benchmark_sam.py), we see a 1.1x speedup when running with SEGMENT_ANYTHING_FAST_USE_FLASH_4 enabled.
+From our [benchmarking](https://github.com/pytorch/ao/blob/main/benchmarks/benchmark_sam.py), we see a 1.1x speedup when running with `SEGMENT_ANYTHING_FAST_USE_FLASH_4` enabled.
 To reproduce these benchmarks you can run the following command:
 
 The inference acceleration of semi-structured sparsity depends on the matmul shapes, which is why we don't see additional speedups when applying to all linear layers (attn + mlp) of segment-anything.

From 1fbea59e1dd5fa7a914a2a941ef1d9c4ab811507 Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Mon, 3 Jun 2024 08:00:07 -0700
Subject: [PATCH 9/9] sigfigs

---
 torchao/sparsity/README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md
index fc116e0a2c..bc0a61b202 100644
--- a/torchao/sparsity/README.md
+++ b/torchao/sparsity/README.md
@@ -38,11 +38,11 @@ The following benchmarks we run on an A100, with batch_size=32 and `bfloat16` dt
 
 | qkv  | proj | lin1 | lin2 | time | memory | img/s |
 | ---- | ---- | ---- | ---- | ---- | ------ | ----- |
-| None | None | None | None | 1361.733349 | 15.808660 | 23.499461 |
-| None | None | sparse (cusparselt) | sparse (cusparselt) | 1245.151100 | 15.462827 | 25.699692 |
-| None | None | sparse (cutlass) | sparse (cutlass) | 1251.047651 | 15.411250 | 25.578562 |
-| sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | 1265.426255 | 12.705007 | 25.287922 |
-| sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | 1274.955840 | 12.704523 | 25.098909 |
+| None | None | None | None | 1361.73 | 15.81 | 23.50 |
+| None | None | sparse (cusparselt) | sparse (cusparselt) | 1245.15 | 15.46 | 25.70 |
+| None | None | sparse (cutlass) | sparse (cutlass) | 1251.05 | 15.41 | 25.58 |
+| sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | sparse (cusparselt) | 1265.43 | 12.71 | 25.29 |
+| sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | sparse (cutlass) | 1274.96 | 12.70 | 25.10 |
 
 #### BERT