diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst
index 4eecb933..2dcb05da 100644
--- a/docs/src/sdp/api.rst
+++ b/docs/src/sdp/api.rst
@@ -421,6 +421,9 @@ Miscellaneous
 .. autodata:: sdp.processors.ipl.ipl_processors.InferenceCommandGenerator
    :annotation:
 
+.. autodata:: sdp.processors.DropSpecifiedFields
+   :annotation:
+
 .. _sdp-base-classes:
 
 Base classes
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index 79f2205e..db7ae781 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -89,6 +89,7 @@
     RenameFields,
     SortManifest,
     SplitOnFixedDuration,
+    DropSpecifiedFields,
 )
 from sdp.processors.modify_manifest.create_manifest import (
     CreateCombinedManifests,
diff --git a/sdp/processors/modify_manifest/common.py b/sdp/processors/modify_manifest/common.py
index 98ad1fa3..ea2fdf67 100644
--- a/sdp/processors/modify_manifest/common.py
+++ b/sdp/processors/modify_manifest/common.py
@@ -401,3 +401,46 @@ def process(self):
         with open(self.output_manifest_file, "wt", encoding="utf8") as fout:
             for _, line in m3.iterrows():
                 fout.write(json.dumps(dict(line), ensure_ascii=False) + "\n")
+
+
+class DropSpecifiedFields(BaseProcessor):
+    """
+    A processor that removes specified fields from each data entry in the manifest.
+
+    This processor reads an input manifest line by line, drops the fields listed in ``fields_to_drop``
+    from each JSON entry, and writes the cleaned entries to the output manifest.
+    Blank lines in the input manifest are skipped.
+
+    Args:
+        fields_to_drop (List[str]): A list of keys to remove from each manifest entry.
+            A single key given as a plain string is treated as a one-element list.
+        **kwargs: Additional arguments passed to the BaseProcessor (e.g., input/output manifest paths).
+
+    Returns:
+        A line-delimited JSON manifest, where each entry is the same as the input,
+        but with the specified fields removed.
+    """
+
+    def __init__(self, fields_to_drop: List[str], **kwargs):
+        super().__init__(**kwargs)
+        self.fields_to_drop = fields_to_drop
+
+    def process(self):
+        # A bare string would turn ``key in fields_to_drop`` into a substring test and drop
+        # unrelated keys, so wrap it; a set built once keeps the per-key membership check cheap.
+        if isinstance(self.fields_to_drop, str):
+            fields_to_drop = {self.fields_to_drop}
+        else:
+            fields_to_drop = set(self.fields_to_drop)
+
+        with open(self.input_manifest_file, "rt", encoding="utf8") as fin, open(
+            self.output_manifest_file, "wt", encoding="utf8"
+        ) as fout:
+            for line in tqdm(fin):
+                # Tolerate blank lines (e.g. a trailing empty line at the end of the manifest).
+                if not line.strip():
+                    continue
+                entry = json.loads(line)
+                # Keep every key that was not requested to be dropped, preserving key order.
+                new_line = {field: value for field, value in entry.items() if field not in fields_to_drop}
+                fout.write(json.dumps(new_line, ensure_ascii=False) + "\n")