diff --git a/.github/workflows/main_ci.yml b/.github/workflows/main_ci.yml
index 4de03b7f6..c2e6ca351 100644
--- a/.github/workflows/main_ci.yml
+++ b/.github/workflows/main_ci.yml
@@ -32,7 +32,7 @@ jobs:
     - name: Install dependencies
       run: |
        python -m pip install --upgrade pip
-       python -m pip install --upgrade pytest pyflakes asv pytest-cov lxml matplotlib packaging humanize "mypy<1.0.0"
+       python -m pip install --upgrade pytest pyflakes asv pytest-cov lxml matplotlib packaging humanize Jinja2 bz2file pandas scikit-learn numpy "mypy<1.0.0"
     - if: ${{matrix.platform == 'macos-latest'}}
       name: Install MacOS deps
       run: |
diff --git a/darshan-util/pydarshan/darshan/glob_feature/__init__.py b/darshan-util/pydarshan/darshan/glob_feature/__init__.py
new file mode 100644
index 000000000..060f8bf81
--- /dev/null
+++ b/darshan-util/pydarshan/darshan/glob_feature/__init__.py
@@ -0,0 +1,5 @@
+"""
+Creates a DataFrame with two columns ("filename_glob" and "glob_count")
+based on the files read by a .darshan file.
+"""
+
diff --git a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py
new file mode 100644
index 000000000..c31e7f760
--- /dev/null
+++ b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py
@@ -0,0 +1,211 @@
+# Creates a DataFrame with two columns ("filename_glob" and "glob_count") based on the files read by a .darshan file.
+# The script uses agglomerative hierarchical clustering to group similar file paths together.
+# It then builds a DataFrame in which one merged path represents each group, using (.*) to mark where the file paths within a group differ.
+# The result is an HTML report summarizing the grouped paths and their counts.
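+# For example, in the accompanying test suite the 1021 "foo" files touched by
+# partial_data_stdio.darshan collapse into the single row
+# "/home/carns/working/dbg/darshan-examples/foo(.*)" with a glob_count of 1021.
+#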
+# Command to run: python glob_feature.py -p path/to/log/file.darshan -o path/to/output_file
+# Command to run in verbose mode, which lists every file beneath its representative merged path:
+# python glob_feature.py -p path/to/log/file.darshan -o path/to/output_file -v
+
+import argparse
+import os
+
+import numpy as np
+import pandas as pd
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import silhouette_score
+
+import darshan
+
+
+def main(log_path, output_path, verbose):
+    report = darshan.DarshanReport(log_path)
+    df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"])
+    # Keep only name records that look like file paths
+    df = df[df["filename_glob"].str.contains(r"/.*")]
+
+    num_files = len(df)
+    optimal_k = 2  # Default number of clusters
+    if num_files == 1:
+        print("Only one file detected.")
+        optimal_k = 1
+        # Process and save results for the single file
+        grouped_paths = {0: [df["filename_glob"].iloc[0]]}
+        new_paths = [(path, 1) for _, paths in grouped_paths.items() for path in paths]
+        print("grouped_paths", grouped_paths)
+    else:
+        # Convert the path strings to TF-IDF feature vectors
+        vectorizer = TfidfVectorizer()
+        X = vectorizer.fit_transform(df["filename_glob"])
+
+        # Determine the maximum number of clusters dynamically; the floor of 2
+        # guarantees the silhouette scan below always runs at least once
+        # (NOTE: this branch assumes at least three paths, since the silhouette
+        # score is undefined for two samples split into two clusters)
+        max_clusters = max(2, int(np.sqrt(len(df))))
+        print("max clusters is", max_clusters)
+
+        silhouette_scores = []
+        for k in range(2, max_clusters + 1):
+            # Perform clustering with k clusters
+            clustering = AgglomerativeClustering(n_clusters=k)
+            clusters = clustering.fit_predict(X.toarray())
+            # Calculate the silhouette score for this choice of k
+            score = silhouette_score(X, clusters)
+            silhouette_scores.append(score)
+
+        # Find the optimal number of clusters based on the silhouette scores
+        optimal_k = np.argmax(silhouette_scores) + 2  # Add 2 because the range starts from 2
+        print("Optimal number of clusters:", optimal_k)
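+
+        # For intuition, a hypothetical toy input: for paths such as
+        #   ["/a/run_x.nc", "/a/run_y.nc", "/b/cfg_1.txt", "/b/cfg_2.txt"]
+        # the TF-IDF vectors of the two /a/* paths sit close together and far
+        # from the /b/* vectors, so k=2 tends to maximize the silhouette score
+        # and each directory lands in its own cluster.
+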
+        # Perform clustering with the optimal number of clusters
+        clustering = AgglomerativeClustering(n_clusters=optimal_k)
+        clusters = clustering.fit_predict(X.toarray())
+        print("clusters are", clusters)
+        grouped_paths = {}
+        for i, cluster_label in enumerate(clusters):
+            if cluster_label not in grouped_paths:
+                grouped_paths[cluster_label] = []
+            grouped_paths[cluster_label].append(df["filename_glob"].iloc[i])
+
+        # Group paths within each cluster by file extension
+        # (computed for inspection; not used further below yet)
+        grouped_by_extension = {}
+        for cluster_label, paths in grouped_paths.items():
+            grouped_by_extension[cluster_label] = {}
+            for path in paths:
+                file_extension = os.path.splitext(path)[1]
+                if file_extension not in grouped_by_extension[cluster_label]:
+                    grouped_by_extension[cluster_label][file_extension] = []
+                grouped_by_extension[cluster_label][file_extension].append(path)
+
+        new_paths = []
+        for _, group in grouped_paths.items():
+            if len(group) > 1:
+                merged_path = ""
+                max_length = max(len(path) for path in group)
+                differing_chars_encountered = False
+
+                # Walk the paths character by character: keep the shared prefix
+                # and insert (.*) at the first position where the paths diverge
+                for i in range(max_length):
+                    chars = set(path[i] if len(path) > i else "" for path in group)
+                    if len(chars) == 1:
+                        merged_path += chars.pop()
+                        differing_chars_encountered = True
+                    else:
+                        if differing_chars_encountered:
+                            merged_path += "(.*)"
+                            differing_chars_encountered = False
+                        break
+
+                # Check if all paths share the same file extension
+                extensions = [os.path.splitext(path)[1] for path in group]
+                common_extension = None
+                if len(set(extensions)) == 1:
+                    common_extension = extensions[0]
+
+                # Append the common extension if it exists and it's not already in the merged path
+                if common_extension and common_extension not in merged_path:
+                    merged_path += common_extension
+
+                new_paths.append((merged_path, len(group)))
+            else:
+                new_paths.append((group[0], 1))
+
+    if verbose:
+        new_paths_verbose = []
+
+        # Sort grouped_paths based on the size of each group (in descending order)
+        sorted_groups = sorted(grouped_paths.items(), key=lambda x: len(x[1]), reverse=True)
+
+        for cluster_label, paths in sorted_groups:
+            if len(paths) > 1:
+                merged_path = ""
+                max_length = max(len(path) for path in paths)
+                differing_chars_encountered = False
+
+                for i in range(max_length):
+                    chars = set(path[i] if len(path) > i else "" for path in paths)
+                    if len(chars) == 1:
+                        merged_path += chars.pop()
+                        differing_chars_encountered = True
+                    else:
+                        if differing_chars_encountered:
+                            merged_path += "(.*)"
+                            differing_chars_encountered = False
+                        break
+
+                # Check if all paths share the same file extension
+                extensions = [os.path.splitext(path)[1] for path in paths]
+                common_extension = None
+                if len(set(extensions)) == 1:
+                    common_extension = extensions[0]
+
+                # Append the merged path if it's not already in the new_paths_verbose list
+                if merged_path and (merged_path, len(paths)) not in new_paths_verbose:
+                    new_paths_verbose.append((merged_path, len(paths)))
+
+                # Append the individual paths beneath the merged path
+                new_paths_verbose.extend([(f"    {path}", 1) for path in paths])
+            else:
+                # Single-member group: keep the path as-is
+                new_paths_verbose.append((paths[0], 1))
+
+        df_verbose = pd.DataFrame(new_paths_verbose, columns=["filename_glob", "glob_count"])
+        print(df_verbose.to_string(index=False))
+
+    # Display or save the DataFrame using the pandas Styler
+    if verbose:
+        styled_html = df_verbose.style.background_gradient(axis=0, cmap="viridis", gmap=df_verbose["glob_count"])
+        styled_html = styled_html.set_properties(subset=["glob_count"], **{"text-align": "right"})
+        styled_html.hide(axis="index")
+        styled_html.set_table_styles([
+            {"selector": "", "props": [("border", "1px solid grey")]},
+            {"selector": "tbody td", "props": [("border", "1px solid grey")]},
+            {"selector": "th", "props": [("border", "1px solid grey")]}
+        ])
+        html = styled_html.to_html()
+
+        with open(output_path, "w") as html_file:
+            html_file.write(html)
+    else:
+        df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"])
+        df = df.sort_values(by="glob_count", ascending=False)
+
+        styled_html = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"])
+        styled_html = styled_html.set_properties(subset=["glob_count"], **{"text-align": "right"})
+        styled_html.hide(axis="index")
+        styled_html.set_table_styles([
+            {"selector": "", "props": [("border", "1px solid grey")]},
+            {"selector": "tbody td", "props": [("border", "1px solid grey")]},
+            {"selector": "th", "props": [("border", "1px solid grey")]}
+        ])
+        html = styled_html.to_html()
+
+        with open(output_path, "w") as html_file:
+            html_file.write(html)
+
+    print("Styled results saved to:", output_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-p', '--log-path', type=str, help="Path to the .darshan log file")
+    parser.add_argument('-o', '--output-path', type=str, help="Path to the output HTML file")
+    parser.add_argument('-v', '--verbose', action='store_true', help="Display verbose output")
+    args = parser.parse_args()
+    main(log_path=args.log_path, output_path=args.output_path, verbose=args.verbose)
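+
+# Example: for the e3sm_io_heatmap_only.darshan log exercised in the test suite,
+# the non-verbose report contains two rows:
+#   filename_glob                                       glob_count
+#   /projects/radix-io/snyder/e3sm/can_I_out_h(.*).nc            2
+#   /projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc            1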
diff --git a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py
new file mode 100644
index 000000000..4bd44e00f
--- /dev/null
+++ b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py
@@ -0,0 +1,188 @@
+# Note: Some tests may currently fail, as this script is still under active development.
+# The log files used here are from the darshan-logs repository.
+import os
+
+import pandas as pd
+import pytest
+from pandas.testing import assert_frame_equal
+
+from darshan.glob_feature import glob_feature
+from darshan.log_utils import get_log_path
+
+
+@pytest.mark.parametrize("log_name, expected_df", [
+    ("e3sm_io_heatmap_only.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/projects/radix-io/snyder/e3sm/can_I_out_h(.*).nc",
+                    "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"],
+                   "glob_count": [2, 1]})),
+
+    ("snyder_acme.exe_id1253318_9-27-24239-1515303144625770178_2.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/projects/ccsm/inputdata/atm/cam/chem/trop_mozart_aero/emis/(.*).nc",
+                    "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*)",
+                    "/projects/ccsm/inputdata/atm/cam/physprops/(.*).nc",
+                    "/projects/ccsm/inputdata/atm/cam/(.*).nc",
+                    "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*).nml",
+                    "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/test_F_case_cetus_dxt.c(.*).nc",
+                    "/projects/ccsm/inputdata/lnd/clm2/(.*).nc",
+                    "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*)",
+                    "/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*).170927-064246",
+                    "/projects/ccsm/inputdata/atm/waccm/(.*).nc",
+                    "/projects/ccsm/inputdata/(.*).nc"],
+                   "glob_count": [18, 14, 14, 13, 9, 9, 6, 6, 5, 3, 3]})),
+
+    ("darshan-apmpi-2nodes-64mpi.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/lus/theta-fs0/projects/Performance/chunduri/MILC/milctestv2-papi-reorder-darshan/MILC_2_526820_2021-06-14-15:58:47/(.*)"],
+                   "glob_count": [2]})),
+
+    ("mpi-io-test.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/global/cscratch1/sd/ssnyder/tmp/mpi-io-test.tmp.dat"],
+                   "glob_count": [1]})),
+
+    ("e3sm_io_heatmap_and_dxt.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/projects/radix-io/snyder/e3sm/can_I_out_h(.*).nc",
+                    "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"],
+                   "glob_count": [2, 1]})),
+
+    ("hdf5_diagonal_write_1_byte_dxt.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"],
+                   "glob_count": [24, 20, 20, 10, 10]})),
+
+    ("hdf5_diagonal_write_bytes_range_dxt.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"],
+                   "glob_count": [24, 20, 20, 10, 10]})),
+
+    ("hdf5_diagonal_write_half_flush_dxt.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"],
+                   "glob_count": [24, 20, 20, 10, 10]})),
+
+    ("hdf5_diagonal_write_half_ranks_dxt.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"],
+                   "glob_count": [24, 20, 20, 10, 10]})),
+
+    ("hdf5_file_opens_only.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/numpy/(.*)",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/__pycache__/(.*).pyc",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/(.*).so",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py(.*)",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/lib-dynload/(.*).so",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/json/(.*)",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/(.*).py",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/(.*).pyc",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/importlib/(.*)",
+                    "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/ctypes",
+                    "/yellow/users/nawtrey/projects/hdf5_testing/test_h5f_only_(.*).h5"],
+                   "glob_count": [140, 62, 47, 37, 22, 17, 15, 8, 6, 6, 4, 4, 3]})),
+
+    ("treddy_h5d_no_h5f.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/home/treddy/python_virtual_envs/python_310_darshan/lib/python3.10/site-packages/h5py/_(.*).pyc",
+                    "/home/treddy/rough_work/darshan/issue_709/rank_(.*)"],
+                   "glob_count": [15, 6]})),
+
+    ("shane_ior-HDF5_id438090-438090_11-9-41522-17417065676046418211_1.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/home/shane/software/ior/build/testFile(.*)"],
+                   "glob_count": [2]})),
+
+    ("shane_ior-PNETCDF_id438100-438100_11-9-41525-10280033558448664385_1.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/home/shane/software/ior/build/testFile(.*)"],
+                   "glob_count": [2]})),
+
+    ("partial_data_stdio.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/home/carns/working/dbg/darshan-examples/foo(.*)",
+                    "/home/carns/working/dbg/darshan-examples/test.out"],
+                   "glob_count": [1021, 1]})),
+
+    ("partial_data_dxt.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/home/carns/working/dbg/darshan-examples/test.out"],
+                   "glob_count": [1]})),
+
+    ("mpi-io-test-ppc64-3.0.0.darshan",
+     pd.DataFrame({"filename_glob":
+                   ["/gpfs/mira-fs0/projects/SSSPPg/snyder/tmp/mpi-io-test.tmp.dat"],
+                   "glob_count": [1]})),
["/tmp/tmp/mpi-io-test.tmp.dat"], + "glob_count": [1]})), + + ("mpi-io-test-x86_64-3.4.0-pre1.darshan", + pd.DataFrame({"filename_glob": + ["/tmp/test/mpi-io-test.tmp.dat"], + "glob_count": [1]})), + + + ("runtime_and_dxt_heatmaps_diagonal_write_only.darshan", + pd.DataFrame({"filename_glob": + ["/yellow/users/treddy/github_projects/heatmap_diagonal/rank_(.*)_write_1_bytes"], + "glob_count": [32]})), + + + ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan", + pd.DataFrame({"filename_glob": + ["/home/laytonjb/PROJECTS/DARSHAN/TEST/jeff.txt"], + "glob_count": [1]})), +]) + + +def test_glob_tables(tmpdir, log_name, expected_df): + print("Current working directory:", os.getcwd()) + log_path = get_log_path(log_name) + print("log path is", log_path) + with tmpdir.as_cwd(): + cwd = os.getcwd() + outfile = os.path.join(cwd, "output.html") + glob_feature.main(log_path, outfile, verbose=False) + actual_table = pd.read_html(outfile)[0] + print("log path is", log_path) + + # Print the contents of the DataFrames + print("Actual DataFrame:") + print(actual_table) + print("Expected DataFrame:") + print(expected_df) + + assert_frame_equal(actual_table, expected_df)