diff --git a/.github/workflows/main_ci.yml b/.github/workflows/main_ci.yml
index 4de03b7f6..c2e6ca351 100644
--- a/.github/workflows/main_ci.yml
+++ b/.github/workflows/main_ci.yml
@@ -32,7 +32,7 @@ jobs:
     - name: Install dependencies
       run: |
        python -m pip install --upgrade pip
-       python -m pip install --upgrade pytest pyflakes asv pytest-cov lxml matplotlib packaging humanize "mypy<1.0.0"
+       python -m pip install --upgrade pytest pyflakes asv pytest-cov lxml matplotlib packaging humanize Jinja2 bz2file pandas scikit-learn numpy "mypy<1.0.0"
     - if: ${{matrix.platform == 'macos-latest'}}
       name: Install MacOS deps
       run: |
diff --git a/darshan-util/pydarshan/darshan/glob_feature/__init__.py b/darshan-util/pydarshan/darshan/glob_feature/__init__.py
new file mode 100644
index 000000000..060f8bf81
--- /dev/null
+++ b/darshan-util/pydarshan/darshan/glob_feature/__init__.py
@@ -0,0 +1,5 @@
+"""
+Creates a DataFrame with two columns ("filename_glob" and "glob_count")
+based on the files read by a .darshan file.
+"""
+
diff --git a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py
new file mode 100644
index 000000000..c31e7f760
--- /dev/null
+++ b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py
@@ -0,0 +1,211 @@
+# Creates a DataFrame with two columns ("filename_glob" and "glob_count") based on the files read by a .darshan file.
+# The script uses agglomerative hierarchical clustering to group similar file paths together.
+# It then builds a DataFrame in which one merged path represents each group, using (.*) to mark where the file paths within a group differ.
+# The result is an HTML report summarizing the grouped paths and their counts.
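+# For example, in the accompanying test suite the 1021 "foo" files touched by
+# partial_data_stdio.darshan collapse into the single row
+# "/home/carns/working/dbg/darshan-examples/foo(.*)" with a glob_count of 1021.
+#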
+# Command to run: python glob_feature.py -p path/to/log/file.darshan -o path/to/output_file
+# Command to run in verbose mode, which lists every file beneath its representative merged path:
+# python glob_feature.py -p path/to/log/file.darshan -o path/to/output_file -v
+
+import argparse
+import os
+
+import numpy as np
+import pandas as pd
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import silhouette_score
+
+import darshan
+
+
+def main(log_path, output_path, verbose):
+    report = darshan.DarshanReport(log_path)
+    df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"])
+    # Keep only name records that look like file paths
+    df = df[df["filename_glob"].str.contains(r"/.*")]
+
+    num_files = len(df)
+    optimal_k = 2  # Default number of clusters
+    if num_files == 1:
+        print("Only one file detected.")
+        optimal_k = 1
+        # Process and save results for the single file
+        grouped_paths = {0: [df["filename_glob"].iloc[0]]}
+        new_paths = [(path, 1) for _, paths in grouped_paths.items() for path in paths]
+        print("grouped_paths", grouped_paths)
+    else:
+        # Convert the path strings to TF-IDF feature vectors
+        vectorizer = TfidfVectorizer()
+        X = vectorizer.fit_transform(df["filename_glob"])
+
+        # Determine the maximum number of clusters dynamically; the floor of 2
+        # guarantees the silhouette scan below always runs at least once
+        # (NOTE: this branch assumes at least three paths, since the silhouette
+        # score is undefined for two samples split into two clusters)
+        max_clusters = max(2, int(np.sqrt(len(df))))
+        print("max clusters is", max_clusters)
+
+        silhouette_scores = []
+        for k in range(2, max_clusters + 1):
+            # Perform clustering with k clusters
+            clustering = AgglomerativeClustering(n_clusters=k)
+            clusters = clustering.fit_predict(X.toarray())
+            # Calculate the silhouette score for this choice of k
+            score = silhouette_score(X, clusters)
+            silhouette_scores.append(score)
+
+        # Find the optimal number of clusters based on the silhouette scores
+        optimal_k = np.argmax(silhouette_scores) + 2  # Add 2 because the range starts from 2
+        print("Optimal number of clusters:", optimal_k)
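+
+        # For intuition, a hypothetical toy input: for paths such as
+        #   ["/a/run_x.nc", "/a/run_y.nc", "/b/cfg_1.txt", "/b/cfg_2.txt"]
+        # the TF-IDF vectors of the two /a/* paths sit close together and far
+        # from the /b/* vectors, so k=2 tends to maximize the silhouette score
+        # and each directory lands in its own cluster.
+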
+        # Perform clustering with the optimal number of clusters
+        clustering = AgglomerativeClustering(n_clusters=optimal_k)
+        clusters = clustering.fit_predict(X.toarray())
+        print("clusters are", clusters)
+        grouped_paths = {}
+        for i, cluster_label in enumerate(clusters):
+            if cluster_label not in grouped_paths:
+                grouped_paths[cluster_label] = []
+            grouped_paths[cluster_label].append(df["filename_glob"].iloc[i])
+
+        # Group paths within each cluster by file extension
+        # (computed for inspection; not used further below yet)
+        grouped_by_extension = {}
+        for cluster_label, paths in grouped_paths.items():
+            grouped_by_extension[cluster_label] = {}
+            for path in paths:
+                file_extension = os.path.splitext(path)[1]
+                if file_extension not in grouped_by_extension[cluster_label]:
+                    grouped_by_extension[cluster_label][file_extension] = []
+                grouped_by_extension[cluster_label][file_extension].append(path)
+
+        new_paths = []
+        for _, group in grouped_paths.items():
+            if len(group) > 1:
+                merged_path = ""
+                max_length = max(len(path) for path in group)
+                differing_chars_encountered = False
+
+                # Walk the paths character by character: keep the shared prefix
+                # and insert (.*) at the first position where the paths diverge
+                for i in range(max_length):
+                    chars = set(path[i] if len(path) > i else "" for path in group)
+                    if len(chars) == 1:
+                        merged_path += chars.pop()
+                        differing_chars_encountered = True
+                    else:
+                        if differing_chars_encountered:
+                            merged_path += "(.*)"
+                            differing_chars_encountered = False
+                        break
+
+                # Check if all paths share the same file extension
+                extensions = [os.path.splitext(path)[1] for path in group]
+                common_extension = None
+                if len(set(extensions)) == 1:
+                    common_extension = extensions[0]
+
+                # Append the common extension if it exists and it's not already in the merged path
+                if common_extension and common_extension not in merged_path:
+                    merged_path += common_extension
+
+                new_paths.append((merged_path, len(group)))
+            else:
+                new_paths.append((group[0], 1))
+
+    if verbose:
+        new_paths_verbose = []
+
+        # Sort grouped_paths based on the size of each group (in descending order)
+        sorted_groups = sorted(grouped_paths.items(), key=lambda x: len(x[1]), reverse=True)
+
+        for cluster_label, paths in sorted_groups:
+            if len(paths) > 1:
+                merged_path = ""
+                max_length = max(len(path) for path in paths)
+                differing_chars_encountered = False
+
+                for i in range(max_length):
+                    chars = set(path[i] if len(path) > i else "" for path in paths)
+                    if len(chars) == 1:
+                        merged_path += chars.pop()
+                        differing_chars_encountered = True
+                    else:
+                        if differing_chars_encountered:
+                            merged_path += "(.*)"
+                            differing_chars_encountered = False
+                        break
+
+                # Check if all paths share the same file extension
+                extensions = [os.path.splitext(path)[1] for path in paths]
+                common_extension = None
+                if len(set(extensions)) == 1:
+                    common_extension = extensions[0]
+
+                # Append the merged path if it's not already in the new_paths_verbose list
+                if merged_path and (merged_path, len(paths)) not in new_paths_verbose:
+                    new_paths_verbose.append((merged_path, len(paths)))
+
+                # Append the individual paths beneath the merged path
+                new_paths_verbose.extend([(f"    {path}", 1) for path in paths])
+            else:
+                # Single-member group: keep the path as-is
+                new_paths_verbose.append((paths[0], 1))
+
+        df_verbose = pd.DataFrame(new_paths_verbose, columns=["filename_glob", "glob_count"])
+        print(df_verbose.to_string(index=False))
+
+    # Display or save the DataFrame using the pandas Styler
+    if verbose:
+        styled_html = df_verbose.style.background_gradient(axis=0, cmap="viridis", gmap=df_verbose["glob_count"])
+        styled_html = styled_html.set_properties(subset=["glob_count"], **{"text-align": "right"})
+        styled_html.hide(axis="index")
+        styled_html.set_table_styles([
+            {"selector": "", "props": [("border", "1px solid grey")]},
+            {"selector": "tbody td", "props": [("border", "1px solid grey")]},
+            {"selector": "th", "props": [("border", "1px solid grey")]}
+        ])
+        html = styled_html.to_html()
+
+        with open(output_path, "w") as html_file:
+            html_file.write(html)
+    else:
+        df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"])
+        df = df.sort_values(by="glob_count", ascending=False)
+
+        styled_html = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"])
+        styled_html = styled_html.set_properties(subset=["glob_count"], **{"text-align": "right"})
+        styled_html.hide(axis="index")
+        styled_html.set_table_styles([
+            {"selector": "", "props": [("border", "1px solid grey")]},
+            {"selector": "tbody td", "props": [("border", "1px solid grey")]},
+            {"selector": "th", "props": [("border", "1px solid grey")]}
+        ])
+        html = styled_html.to_html()
+
+        with open(output_path, "w") as html_file:
+            html_file.write(html)
+
+    print("Styled results saved to:", output_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-p', '--log-path', type=str, help="Path to the .darshan log file")
+    parser.add_argument('-o', '--output-path', type=str, help="Path to the output HTML file")
+    parser.add_argument('-v', '--verbose', action='store_true', help="Display verbose output")
+    args = parser.parse_args()
+    main(log_path=args.log_path, output_path=args.output_path, verbose=args.verbose)
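+
+# Example: for the e3sm_io_heatmap_only.darshan log exercised in the test suite,
+# the non-verbose report contains two rows:
+#   filename_glob                                       glob_count
+#   /projects/radix-io/snyder/e3sm/can_I_out_h(.*).nc            2
+#   /projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc            1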
diff --git a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py
new file mode 100644
index 000000000..4bd44e00f
--- /dev/null
+++ b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py
@@ -0,0 +1,188 @@
+# Note: Some tests may currently fail, as this script is still under active development.
+# The log files used here are from the darshan-logs repository.
+import os
+
+import pandas as pd
+import pytest
+from pandas.testing import assert_frame_equal
+
+from darshan.glob_feature import glob_feature
+from darshan.log_utils import get_log_path
+
+
+@pytest.mark.parametrize("log_name, expected_df", [
+    ("e3sm_io_heatmap_only.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/projects/radix-io/snyder/e3sm/can_I_out_h(.*).nc",
+                    "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"],
+                   "glob_count": [2, 1]})),
+
+    ("snyder_acme.exe_id1253318_9-27-24239-1515303144625770178_2.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/projects/ccsm/inputdata/atm/cam/chem/trop_mozart_aero/emis/(.*).nc",
+                    "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*)",
+                    "/projects/ccsm/inputdata/atm/cam/physprops/(.*).nc",
+                    "/projects/ccsm/inputdata/atm/cam/(.*).nc",
+                    "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*).nml",
+                    "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/test_F_case_cetus_dxt.c(.*).nc",
+                    "/projects/ccsm/inputdata/lnd/clm2/(.*).nc",
+                    "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*)",
+                    "/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*).170927-064246",
+                    "/projects/ccsm/inputdata/atm/waccm/(.*).nc",
+                    "/projects/ccsm/inputdata/(.*).nc"],
+                   "glob_count": [18, 14, 14, 13, 9, 9, 6, 6, 5, 3, 3]})),
+
+    ("darshan-apmpi-2nodes-64mpi.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/lus/theta-fs0/projects/Performance/chunduri/MILC/milctestv2-papi-reorder-darshan/MILC_2_526820_2021-06-14-15:58:47/(.*)"],
+                   "glob_count": [2]})),
+
+    ("mpi-io-test.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/global/cscratch1/sd/ssnyder/tmp/mpi-io-test.tmp.dat"],
+                   "glob_count": [1]})),
+
+    ("e3sm_io_heatmap_and_dxt.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/projects/radix-io/snyder/e3sm/can_I_out_h(.*).nc",
+                    "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"],
+                   "glob_count": [2, 1]})),
+
+    ("hdf5_diagonal_write_1_byte_dxt.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"],
+                   "glob_count": [24, 20, 20, 10, 10]})),
+
+    ("hdf5_diagonal_write_bytes_range_dxt.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"],
+                   "glob_count": [24, 20, 20, 10, 10]})),
+
+    ("hdf5_diagonal_write_half_flush_dxt.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"],
+                   "glob_count": [24, 20, 20, 10, 10]})),
+
+    ("hdf5_diagonal_write_half_ranks_dxt.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"],
+                   "glob_count": [24, 20, 20, 10, 10]})),
+
+    ("hdf5_file_opens_only.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/numpy/(.*)",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/__pycache__/(.*).pyc",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/(.*).so",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py(.*)",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/lib-dynload/(.*).so",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/json/(.*)",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/(.*).py",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/(.*).pyc",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/importlib/(.*)",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/ctypes",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_h5f_only_(.*).h5"],
+                   "glob_count": [140, 62, 47, 37, 22, 17, 15, 8, 6, 6, 4, 4, 3]})),
+
+    ("treddy_h5d_no_h5f.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/home/treddy/python_virtual_envs/python_310_darshan/lib/python3.10/site-packages/h5py/_(.*).pyc",
+                    "/home/treddy/rough_work/darshan/issue_709/rank_(.*)"],
+                   "glob_count": [15, 6]})),
+
+    ("shane_ior-HDF5_id438090-438090_11-9-41522-17417065676046418211_1.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/home/shane/software/ior/build/testFile(.*)"],
+                   "glob_count": [2]})),
+
+    ("shane_ior-PNETCDF_id438100-438100_11-9-41525-10280033558448664385_1.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/home/shane/software/ior/build/testFile(.*)"],
+                   "glob_count": [2]})),
+
+    ("partial_data_stdio.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/home/carns/working/dbg/darshan-examples/foo(.*)",
+                    "/home/carns/working/dbg/darshan-examples/test.out"],
+                   "glob_count": [1021, 1]})),
+
+    ("partial_data_dxt.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/home/carns/working/dbg/darshan-examples/test.out"],
+                   "glob_count": [1]})),
+
+    ("mpi-io-test-ppc64-3.0.0.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/gpfs/mira-fs0/projects/SSSPPg/snyder/tmp/mpi-io-test.tmp.dat"],
+                   "glob_count": [1]})),
["/tmp/tmp/mpi-io-test.tmp.dat"], + "glob_count": [1]})), + + ("mpi-io-test-x86_64-3.4.0-pre1.darshan", + pd.DataFrame({"filename_glob": + ["/tmp/test/mpi-io-test.tmp.dat"], + "glob_count": [1]})), + + + ("runtime_and_dxt_heatmaps_diagonal_write_only.darshan", + pd.DataFrame({"filename_glob": + ["/yellow/users/treddy/github_projects/heatmap_diagonal/rank_(.*)_write_1_bytes"], + "glob_count": [32]})), + + + ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan", + pd.DataFrame({"filename_glob": + ["/home/laytonjb/PROJECTS/DARSHAN/TEST/jeff.txt"], + "glob_count": [1]})), +]) + + +def test_glob_tables(tmpdir, log_name, expected_df): + print("Current working directory:", os.getcwd()) + log_path = get_log_path(log_name) + print("log path is", log_path) + with tmpdir.as_cwd(): + cwd = os.getcwd() + outfile = os.path.join(cwd, "output.html") + glob_feature.main(log_path, outfile, verbose=False) + actual_table = pd.read_html(outfile)[0] + print("log path is", log_path) + + # Print the contents of the DataFrames + print("Actual DataFrame:") + print(actual_table) + print("Expected DataFrame:") + print(expected_df) + + assert_frame_equal(actual_table, expected_df)