diff --git a/CHANGELOG.md b/CHANGELOG.md index 722f5540b..5d72591d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # Changelog +## Unreleased changes + +### What's Changed + +* Major refactoring of the block lifecycle, enabling validation of block data both before and after saving, by @ml-evs in https://github.com/datalab-org/datalab/pull/1311 +* Replace browser-native dialogs with custom datalab dialog service by @BenjaminCharmes in https://github.com/datalab-org/datalab/pull/1212 +* Resolve CVEs on mermaid.js and cross-spawn by @dependabot[bot] in https://github.com/datalab-org/datalab/pull/1317 + +**Full Changelog**: https://github.com/datalab-org/datalab/compare/v0.6.2...main + ## v0.6.2 (August 2025) This patch release adds a hotfix for broken media blocks when encoding TIF files (#1318). diff --git a/pydatalab/pyproject.toml b/pydatalab/pyproject.toml index 74c25507b..6503e8fad 100644 --- a/pydatalab/pyproject.toml +++ b/pydatalab/pyproject.toml @@ -140,6 +140,8 @@ filterwarnings = [ "ignore:.*np.bool8*:DeprecationWarning", "ignore::pytest.PytestUnraisableExceptionWarning", "ignore:.*JCAMP-DX key without value*:UserWarning", + "ignore:.*Unable to find wavenumber unit*:UserWarning", + "ignore:.*Unable to find wavenumber offset*:UserWarning", ] [tool.liccheck] diff --git a/pydatalab/schemas/cell.json b/pydatalab/schemas/cell.json index f891bc9af..a3a97ed19 100644 --- a/pydatalab/schemas/cell.json +++ b/pydatalab/schemas/cell.json @@ -176,7 +176,7 @@ "definitions": { "DataBlockResponse": { "title": "DataBlockResponse", - "description": "A generic response model for a block, i.e., what is stored in `self.data`\nin the corresponding DataBlock class.", + "description": "A generic response model for a block, i.e., what is stored in `self.data`\nin the corresponding DataBlock class.\n\nIt is expected but not mandatory that this model will be extended by the specific block type\nwhere possible.", "type": "object", "properties": { "blocktype": { @@ -214,6 +214,20 @@ "type": "string" } }, + "errors": { + "title": "Errors", + "type": "array", + "items": { + "type": "string" + } + }, + "warnings": { + "title": "Warnings", + "type": "array", + "items": { + "type": "string" + } + }, "b64_encoded_image": { "title": "B64 Encoded Image", "type": "object", @@ -223,7 +237,15 @@ }, "bokeh_plot_data": { "title": "Bokeh Plot Data", - "type": "string" + "type": "object" + }, + "computed": { + "title": "Computed", + "type": "object" + }, + "metadata": { + "title": "Metadata", + "type": "object" } }, "required": [ diff --git a/pydatalab/schemas/equipment.json b/pydatalab/schemas/equipment.json index 65dc68c08..9affcfda8 100644 --- a/pydatalab/schemas/equipment.json +++ b/pydatalab/schemas/equipment.json @@ -140,7 +140,7 @@ "definitions": { "DataBlockResponse": { "title": "DataBlockResponse", - "description": "A generic response model for a block, i.e., what is stored in `self.data`\nin the corresponding DataBlock class.", + "description": "A generic response model for a block, i.e., what is stored in `self.data`\nin the corresponding DataBlock class.\n\nIt is expected but not mandatory that this model will be extended by the specific block type\nwhere possible.", "type": "object", "properties": { "blocktype": { @@ -178,6 +178,20 @@ "type": "string" } }, + "errors": { + "title": "Errors", + "type": "array", + "items": { + "type": "string" + } + }, + "warnings": { + "title": "Warnings", + "type": "array", + "items": { + "type": "string" + } + }, "b64_encoded_image": { "title": "B64 Encoded Image", "type": "object", @@ -187,7 
+201,15 @@ }, "bokeh_plot_data": { "title": "Bokeh Plot Data", - "type": "string" + "type": "object" + }, + "computed": { + "title": "Computed", + "type": "object" + }, + "metadata": { + "title": "Metadata", + "type": "object" } }, "required": [ diff --git a/pydatalab/schemas/sample.json b/pydatalab/schemas/sample.json index e5ec8b5fd..3c8ebfb91 100644 --- a/pydatalab/schemas/sample.json +++ b/pydatalab/schemas/sample.json @@ -229,7 +229,7 @@ }, "DataBlockResponse": { "title": "DataBlockResponse", - "description": "A generic response model for a block, i.e., what is stored in `self.data`\nin the corresponding DataBlock class.", + "description": "A generic response model for a block, i.e., what is stored in `self.data`\nin the corresponding DataBlock class.\n\nIt is expected but not mandatory that this model will be extended by the specific block type\nwhere possible.", "type": "object", "properties": { "blocktype": { @@ -267,6 +267,20 @@ "type": "string" } }, + "errors": { + "title": "Errors", + "type": "array", + "items": { + "type": "string" + } + }, + "warnings": { + "title": "Warnings", + "type": "array", + "items": { + "type": "string" + } + }, "b64_encoded_image": { "title": "B64 Encoded Image", "type": "object", @@ -276,7 +290,15 @@ }, "bokeh_plot_data": { "title": "Bokeh Plot Data", - "type": "string" + "type": "object" + }, + "computed": { + "title": "Computed", + "type": "object" + }, + "metadata": { + "title": "Metadata", + "type": "object" } }, "required": [ diff --git a/pydatalab/schemas/startingmaterial.json b/pydatalab/schemas/startingmaterial.json index 4ab9c9392..d6f180279 100644 --- a/pydatalab/schemas/startingmaterial.json +++ b/pydatalab/schemas/startingmaterial.json @@ -282,7 +282,7 @@ }, "DataBlockResponse": { "title": "DataBlockResponse", - "description": "A generic response model for a block, i.e., what is stored in `self.data`\nin the corresponding DataBlock class.", + "description": "A generic response model for a block, i.e., what is stored in `self.data`\nin the corresponding DataBlock class.\n\nIt is expected but not mandatory that this model will be extended by the specific block type\nwhere possible.", "type": "object", "properties": { "blocktype": { @@ -320,6 +320,20 @@ "type": "string" } }, + "errors": { + "title": "Errors", + "type": "array", + "items": { + "type": "string" + } + }, + "warnings": { + "title": "Warnings", + "type": "array", + "items": { + "type": "string" + } + }, "b64_encoded_image": { "title": "B64 Encoded Image", "type": "object", @@ -329,7 +343,15 @@ }, "bokeh_plot_data": { "title": "Bokeh Plot Data", - "type": "string" + "type": "object" + }, + "computed": { + "title": "Computed", + "type": "object" + }, + "metadata": { + "title": "Metadata", + "type": "object" } }, "required": [ diff --git a/pydatalab/src/pydatalab/apps/chat/blocks.py b/pydatalab/src/pydatalab/apps/chat/blocks.py index 0f59df075..5161f17cf 100644 --- a/pydatalab/src/pydatalab/apps/chat/blocks.py +++ b/pydatalab/src/pydatalab/apps/chat/blocks.py @@ -1,18 +1,29 @@ import json -import os +import warnings -from langchain_anthropic import ChatAnthropic -from langchain_core.language_models.chat_models import BaseChatModel -from langchain_openai import ChatOpenAI +from langchain_core.messages import AIMessage, HumanMessage, SystemMessage +from pydantic import Field from pydatalab.blocks.base import DataBlock from pydatalab.logger import LOGGER from pydatalab.models import ITEM_MODELS +from pydatalab.models.blocks import DataBlockResponse from pydatalab.utils import 
CustomJSONEncoder +from .models import AVAILABLE_MODELS, ModelCard + __all__ = ("ChatBlock",) +class ChatBlockResponse(DataBlockResponse): + messages: list[dict] = Field(default_factory=list) + prompt: str | None + model: str + available_models: dict[str, ModelCard] = Field(exclude=True) + token_count: int | None + temperature: float + + class ChatBlock(DataBlock): """This block uses API calls to external LLMs via Langchain to provide a conversational interface to a user's data. @@ -33,11 +44,12 @@ class ChatBlock(DataBlock): """ + block_db_model = ChatBlockResponse + blocktype = "chat" description = "Virtual LLM assistant block allows you to converse with your data." name = "Whinchat assistant" accepted_file_extensions = None - chat_client: BaseChatModel | None = None __supports_collections = True @@ -48,106 +60,58 @@ class ChatBlock(DataBlock): Be as concise as possible. When saying your name, type a bird emoji right after whinchat 🐦. """, "temperature": 0.2, - "error_message": None, "model": "gpt-4o", - "available_models": { - "claude-3-5-sonnet-20241022": { - "name": "claude-3-5-sonnet-20241022", - "context_window": 200_000, - "input_cost_usd_per_MTok": 3.00, - "output_cost_usd_per_MTok": 15.00, - }, - "claude-3-5-haiku-20241022": { - "name": "claude-3-haiku-20241022", - "context_window": 200_000, - "input_cost_usd_per_MTok": 1.00, - "output_cost_usd_per_MTok": 5.00, - }, - "claude-3-haiku-20240307": { - "name": "claude-3-haiku-20240307", - "context_window": 200_000, - "input_cost_usd_per_MTok": 0.25, - "output_cost_usd_per_MTok": 1.25, - }, - "claude-3-opus-20240229": { - "name": "claude-3-opus-20240229", - "context_window": 200000, - "input_cost_usd_per_MTok": 15.00, - "output_cost_usd_per_MTok": 75.00, - }, - "gpt-4o": { - "name": "gpt-4o", - "context_window": 128000, - "input_cost_usd_per_MTok": 5.00, - "output_cost_usd_per_MTok": 15.00, - }, - "gpt-4o-mini": { - "name": "gpt-4o-mini", - "context_window": 128_000, - "input_cost_usd_per_MTok": 0.15, - "output_cost_usd_per_MTok": 0.60, - }, - "gpt-4": { - "name": "gpt-4", - "context_window": 8192, - "input_cost_usd_per_MTok": 30.00, - "output_cost_usd_per_MTok": 60.00, - }, - "gpt-4-turbo": { - "name": "gpt-4-turbo", - "context_window": 128000, - "input_cost_usd_per_MTok": 10.00, - "output_cost_usd_per_MTok": 30.00, - }, - }, + "available_models": AVAILABLE_MODELS, } - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def to_db(self): - """returns a dictionary with the data for this - block, ready to be input into mongodb""" - self.render() - return super().to_db() - @property def plot_functions(self): return (self.render,) - def render(self): - if not self.data.get("messages"): - if (item_id := self.data.get("item_id")) is not None: - info_json = self._prepare_item_json_for_chat(item_id) - elif (collection_id := self.data.get("collection_id")) is not None: - info_json = self._prepare_collection_json_for_chat(collection_id) - else: - raise RuntimeError("No item or collection id provided") + def start_conversation( + self, item_data: dict | None = None, collection_data: dict | None = None + ): + """Starts a new conversation with the system prompt, embedding + the current item or collection data. - self.data["messages"] = [ - { - "role": "system", - "content": self.defaults["system_prompt"], - }, - { - "role": "user", - "content": f"""Here is the JSON data for the current item(s): {info_json}. 
+ """ + + if (item_id := self.data.get("item_id")) is not None: + info_json = self._prepare_item_json_for_chat(item_id, item_data=item_data) + elif (collection_id := self.data.get("collection_id")) is not None: + info_json = self._prepare_collection_json_for_chat( + collection_id, collection_data=collection_data + ) + else: + raise RuntimeError("No item or collection id provided") + + self.data["messages"] = [ + { + "role": "system", + "content": self.defaults["system_prompt"], + }, + { + "role": "user", + "content": f"""Here is the JSON data for the current item(s): {info_json}. Start with a friendly introduction and give me a one sentence summary of what this is (not detailed, no information about specific masses). """, - }, - ] + }, + ] + + def continue_conversation(self, prompt: str | None) -> None: + """Continues the conversation based on the passed user prompt. + + Parameters: + prompt: The textual response from the user. - if self.data.get("prompt") and self.data.get("prompt").strip(): + """ + if prompt and prompt.strip(): self.data["messages"].append( { "role": "user", - "content": self.data["prompt"], + "content": prompt, } ) self.data["prompt"] = None - else: - LOGGER.debug( - "Chat block: no prompt was provided (or prompt was entirely whitespace), so no inference will be performed" - ) try: if self.data["messages"][-1].role not in ("user", "system"): @@ -158,7 +122,7 @@ def render(self): if self.data.get("model") not in self.data.get("available_models", {}): bad_model = self.data.get("model") - self.data["error_message"] = ( + warnings.warn( f"Chatblock received an unknown or deprecated model: {bad_model}. Reverting to default model {self.defaults['model']}." ) self.data["model"] = self.defaults["model"] @@ -166,25 +130,14 @@ def render(self): try: model_name = self.data["model"] - model_dict = self.data["available_models"][model_name] - LOGGER.warning(f"Initializing chatblock with model: {model_name}") + model_cls = self.data["available_models"][model_name] - if model_name.startswith("claude"): - self.chat_client = ChatAnthropic( - anthropic_api_key=os.environ.get("ANTHROPIC_API_KEY"), - model=model_name, - ) - elif model_name.startswith("gpt"): - self.chat_client = ChatOpenAI( - api_key=os.environ.get("OPENAI_API_KEY"), - model=model_name, - ) + chat_client = model_cls.chat_client(model=model_cls.name) LOGGER.debug( - f'submitting request to API for completion with last message role "{self.data["messages"][-1]["role"]}" (message = {self.data["messages"][-1:]}). Temperature = {self.data["temperature"]} (type {type(self.data["temperature"])})' + f'submitting request to API for completion with last message role "{self.data["messages"][-1]["role"]}" (message = {self.data["messages"][-1:]}).' 
+ f"Temperature = {self.data['temperature']} (type {type(self.data['temperature'])})" ) - from langchain_core.messages import AIMessage, HumanMessage, SystemMessage - # Convert your messages to the required format langchain_messages = [] for message in self.data["messages"]: @@ -195,54 +148,68 @@ def render(self): else: langchain_messages.append(AIMessage(content=message["content"])) - token_count = self.chat_client.get_num_tokens_from_messages(langchain_messages) + if model_cls.name == "langchain-debug": + token_count = sum(len(m["content"]) for m in self.data["messages"]) + else: + token_count = chat_client.get_num_tokens_from_messages(langchain_messages) self.data["token_count"] = token_count - if token_count >= model_dict["context_window"]: - self.data["error_message"] = ( - f"""This conversation has reached its maximum context size and the chatbot won't be able to respond further ({token_count} tokens, max: {model_dict["context_window"]}). Please make a new chat block to start fresh, or use a model with a larger context window""" + if token_count >= model_cls.context_window: + raise RuntimeError( + f"""This conversation has reached its maximum context size and the chatbot won't be able to respond further +({token_count} tokens, max: {model_cls.context_window}). +Please make a new chat block to start fresh, or use a model with a larger context window.""" ) - return # Call the chat client with the invoke method - response = self.chat_client.invoke(langchain_messages) + response = chat_client.invoke(langchain_messages) langchain_messages.append(response) - token_count = self.chat_client.get_num_tokens_from_messages(langchain_messages) - - self.data["token_count"] = token_count + # Now recalculate the token count after model output self.data["messages"].append({"role": "assistant", "content": response.content}) - self.data["error_message"] = None + if model_cls.name == "langchain-debug": + token_count = sum(len(m["content"]) for m in self.data["messages"]) + else: + token_count = chat_client.get_num_tokens_from_messages(langchain_messages) + self.data["token_count"] = token_count except Exception as exc: - LOGGER.debug("Received an error from API: %s", exc) - self.data["error_message"] = ( + raise RuntimeError( f"Received an error from the API: {exc}.\n\n Consider choosing a different model and reloading the block." 
) - return - def _prepare_item_json_for_chat(self, item_id: str): + def render(self): + if not self.data.get("messages"): + self.start_conversation() + + self.continue_conversation(self.data.get("prompt")) + + def _prepare_item_json_for_chat(self, item_id: str, item_data: dict | None = None): from pydatalab.routes.v0_1.items import get_item_data - item_info = get_item_data(item_id, load_blocks=False).json + if item_data is None: + item_data = get_item_data(item_id).json + + if item_data["status"] != "success": + raise RuntimeError(f"Attempt to get item data for {item_id=} failed.") - model = ITEM_MODELS[item_info["item_data"]["type"]](**item_info["item_data"]) - if model.blocks_obj: - model.blocks_obj = { - k: value for k, value in model.blocks_obj.items() if value["blocktype"] != "chat" + item_model = ITEM_MODELS[item_data["item_data"]["type"]](**item_data["item_data"]) + if item_model.blocks_obj: + item_model.blocks_obj = { + k: block for k, block in item_model.blocks_obj.items() if block.blocktype != "chat" } - item_info = model.dict(exclude_none=True, exclude_unset=True) - item_info["type"] = model.type + item_data = item_model.dict(exclude_none=True, exclude_unset=True) + item_data["type"] = item_model.type # strip irrelevant or large fields item_filenames = { - str(file["immutable_id"]): file["name"] for file in item_info.get("files", []) + str(file["immutable_id"]): file["name"] for file in item_data.get("files", []) } - big_data_keys = ["bokeh_plot_data", "b64_encoded_image"] - for block in item_info.get("blocks_obj", {}).values(): + big_data_keys = ["bokeh_plot_data", "b64_encoded_image", "computed"] + for block in item_data.get("blocks_obj", {}).values(): block_fields_to_remove = ["item_id", "block_id", "collection_id"] + big_data_keys [block.pop(field, None) for field in block_fields_to_remove] @@ -251,13 +218,12 @@ def _prepare_item_json_for_chat(self, item_id: str): "acquisition_parameters", "carrier_offset_Hz", "nscans", - "processed_data", "processed_data_shape", "processing_parameters", "pulse_program", "selected_process", ] - [block.pop(field, None) for field in NMR_fields_to_remove] + [block["metadata"].pop(field, None) for field in NMR_fields_to_remove] # replace file_id with the actual filename file_id = block.pop("file_id", None) @@ -276,22 +242,22 @@ def _prepare_item_json_for_chat(self, item_id: str): ] for k in top_level_keys_to_remove: - item_info.pop(k, None) + item_data.pop(k, None) - for ind, f in enumerate(item_info.get("relationships", [])): - item_info["relationships"][ind] = { + for ind, f in enumerate(item_data.get("relationships", [])): + item_data["relationships"][ind] = { k: v for k, v in f.items() if k in ["item_id", "type", "relation"] } - item_info["files"] = [file["name"] for file in item_info.get("files", [])] - item_info["creators"] = [ - creator["display_name"] for creator in item_info.get("creators", []) + item_data["files"] = [file["name"] for file in item_data.get("files", [])] + item_data["creators"] = [ + creator["display_name"] for creator in item_data.get("creators", []) ] # move blocks from blocks_obj to a simpler list to further cut down tokens, # especially in alphanumeric block_id fields - item_info["blocks"] = [block for block in item_info.pop("blocks_obj", {}).values()] + item_data["blocks"] = [block for block in item_data.pop("blocks_obj", {}).values()] - item_info = {k: value for k, value in item_info.items() if value} + item_data = {k: value for k, value in item_data.items() if value} for key in [ "synthesis_constituents", @@ 
-299,10 +265,8 @@ def _prepare_item_json_for_chat(self, item_id: str): "negative_electrode", "electrolyte", ]: - if key in item_info: - for constituent in item_info[key]: - LOGGER.debug("iterating through constituents:") - LOGGER.debug(constituent) + if key in item_data: + for constituent in item_data[key]: if "quantity" in constituent: constituent["quantity"] = ( f"{constituent.get('quantity', 'unknown')} {constituent.get('unit', '')}" ) @@ -311,7 +275,7 @@ def _prepare_item_json_for_chat(self, item_id: str): # Note manual replaces to help avoid escape sequences that take up extra tokens item_info_json = ( - json.dumps(item_info, cls=CustomJSONEncoder) + json.dumps(item_data, cls=CustomJSONEncoder) .replace('"', "'") .replace(r"\'", "'") .replace(r"\n", " ") @@ -319,12 +283,16 @@ return item_info_json - def _prepare_collection_json_for_chat(self, collection_id: str): + def _prepare_collection_json_for_chat( + self, collection_id: str, collection_data: dict | None = None + ): from pydatalab.routes.v0_1.collections import get_collection - collection_data = get_collection(collection_id).json - if collection_data["status"] != "success": - raise RuntimeError(f"Attempt to get collection data for {collection_id} failed.") + if not collection_data: + collection_data = get_collection(collection_id).json + + if collection_data["status"] != "success": + raise RuntimeError(f"Attempt to get collection data for {collection_id} failed.") children = collection_data["child_items"] return ( diff --git a/pydatalab/src/pydatalab/apps/chat/models.py b/pydatalab/src/pydatalab/apps/chat/models.py new file mode 100644 index 000000000..c1d5f3ce9 --- /dev/null +++ b/pydatalab/src/pydatalab/apps/chat/models.py @@ -0,0 +1,85 @@ +from langchain_anthropic import ChatAnthropic +from langchain_core.language_models import BaseChatModel, ParrotFakeChatModel +from langchain_openai import ChatOpenAI +from pydantic import BaseModel + + +class ModelCard(BaseModel): + name: str + context_window: int + input_cost_usd_per_MTok: float + output_cost_usd_per_MTok: float + chat_client: type[BaseChatModel] + + +__all__ = ("AVAILABLE_MODELS", "ModelCard") + +_AVAILABLE_MODELS = [ + { + "name": "langchain-debug", + "context_window": 200_000_000, + "input_cost_usd_per_MTok": 0.0, + "output_cost_usd_per_MTok": 0.0, + "chat_client": ParrotFakeChatModel, + }, + { + "name": "claude-3-5-sonnet-20241022", + "context_window": 200_000, + "input_cost_usd_per_MTok": 3.00, + "output_cost_usd_per_MTok": 15.00, + "chat_client": ChatAnthropic, + }, + { + "name": "claude-3-5-haiku-20241022", + "context_window": 200_000, + "input_cost_usd_per_MTok": 1.00, + "output_cost_usd_per_MTok": 5.00, + "chat_client": ChatAnthropic, + }, + { + "name": "claude-3-haiku-20240307", + "context_window": 200_000, + "input_cost_usd_per_MTok": 0.25, + "output_cost_usd_per_MTok": 1.25, + "chat_client": ChatAnthropic, + }, + { + "name": "claude-3-opus-20240229", + "context_window": 200000, + "input_cost_usd_per_MTok": 15.00, + "output_cost_usd_per_MTok": 75.00, + "chat_client": ChatAnthropic, + }, + { + "name": "gpt-4o", + "context_window": 128000, + "input_cost_usd_per_MTok": 5.00, + "output_cost_usd_per_MTok": 15.00, + "chat_client": ChatOpenAI, + }, + { + "name": "gpt-4o-mini", + "context_window": 128_000, + "input_cost_usd_per_MTok": 0.15, + "output_cost_usd_per_MTok": 0.60, + "chat_client": ChatOpenAI, + }, + { + "name": "gpt-4", + "context_window": 8192, + "input_cost_usd_per_MTok": 30.00, + "output_cost_usd_per_MTok":
60.00, + "chat_client": ChatOpenAI, + }, + { + "name": "gpt-4-turbo", + "context_window": 128000, + "input_cost_usd_per_MTok": 10.00, + "output_cost_usd_per_MTok": 30.00, + "chat_client": ChatOpenAI, + }, +] + +AVAILABLE_MODELS: dict[str, ModelCard] = { + model["name"]: ModelCard(**model) for model in _AVAILABLE_MODELS +} diff --git a/pydatalab/src/pydatalab/apps/nmr/blocks.py b/pydatalab/src/pydatalab/apps/nmr/blocks.py index 7fa12a223..7d5d8956e 100644 --- a/pydatalab/src/pydatalab/apps/nmr/blocks.py +++ b/pydatalab/src/pydatalab/apps/nmr/blocks.py @@ -23,6 +23,7 @@ class NMRBlock(DataBlock): description = "A data block for loading and visualizing 1D NMR data from Bruker projects or JCAMP-DX files." accepted_file_extensions = BRUKER_FILE_EXTENSIONS + JCAMP_FILE_EXTENSIONS + processed_data: dict | None = None defaults = {"process number": 1} _supports_collections = False @@ -34,7 +35,7 @@ def read_bruker_nmr_data( self, filename: str | Path | None = None, file_info: dict | None = None, - ) -> tuple[dict | None, dict] | None: + ) -> tuple[dict | None, dict]: """Loads a Bruker project from the passed or attached zip file and parses it into a serialized dataframe and metadata dictionary. @@ -119,7 +120,6 @@ def read_bruker_nmr_data( metadata["title"] = topspin_title self.data["metadata"] = metadata - self.data["processed_data"] = serialized_df return serialized_df, metadata @@ -202,24 +202,26 @@ def read_jcamp_nmr_data( pass serialized_df = df.to_dict() if (df is not None) else None - - self.data["processed_data"] = serialized_df self.data["metadata"] = metadata + return serialized_df, metadata + def load_nmr_data(self, file_info: dict): location, name, ext = self._extract_file_info(file_info=file_info) if ext == ".zip": - self.read_bruker_nmr_data(file_info=file_info) + df, metadata = self.read_bruker_nmr_data(file_info=file_info) elif ext in (".jdx", ".dx"): - self.read_jcamp_nmr_data(file_info=file_info) + df, metadata = self.read_jcamp_nmr_data(file_info=file_info) else: raise RuntimeError( f"Unsupported file extension for NMR reader: {ext} (must be one of {self.accepted_file_extensions})" ) + return df + def generate_nmr_plot(self, parse: bool = True): """Generate an NMR plot and store processed data for the data files attached to this block. @@ -232,7 +234,7 @@ def generate_nmr_plot(self, parse: bool = True): file_info = get_file_info_by_id(self.data["file_id"], update_if_live=True) name, ext = os.path.splitext(file_info["name"]) - self.load_nmr_data(file_info) + self.processed_data = self.load_nmr_data(file_info) processed_data_shape = self.data.get("metadata", {}).get("processed_data_shape", []) if not processed_data_shape or len(processed_data_shape) > 1: @@ -241,14 +243,14 @@ def generate_nmr_plot(self, parse: bool = True): ) return - if "processed_data" not in self.data or not self.data["processed_data"]: + if not self.processed_data: self.data["bokeh_plot_data"] = None warnings.warn( "No compatible processed data available for plotting, only metadata will be displayed." 
) return - df = pd.DataFrame(self.data["processed_data"]) + df = pd.DataFrame(self.processed_data) df["normalized intensity"] = df.intensity / df.intensity.max() self.data["bokeh_plot_data"] = self.make_nmr_plot(df, self.data["metadata"]) diff --git a/pydatalab/src/pydatalab/apps/xrd/blocks.py b/pydatalab/src/pydatalab/apps/xrd/blocks.py index 4990ff67c..00c631255 100644 --- a/pydatalab/src/pydatalab/apps/xrd/blocks.py +++ b/pydatalab/src/pydatalab/apps/xrd/blocks.py @@ -262,7 +262,8 @@ def generate_xrd_plot(self, filenames: list[str | Path] | None = None) -> None: pattern_df["normalized intensity (staggered)"] += ind pattern_dfs.append(pattern_df) - self.data["peak_data"] = peak_information + self.data["computed"] = {} + self.data["computed"]["peak_data"] = peak_information elif filenames is None: file_info = get_file_info_by_id(self.data["file_id"], update_if_live=True) @@ -284,9 +285,9 @@ def generate_xrd_plot(self, filenames: list[str | Path] | None = None) -> None: f"{self.data.get('wavelength', self.defaults['wavelength'])} Å" ) peak_model = PeakInformation(**peak_data) - if "peak_data" not in self.data: - self.data["peak_data"] = {} - self.data["peak_data"][str(file_info["immutable_id"])] = peak_model.dict() + if "computed" not in self.data: + self.data["computed"] = {"peak_data": {}} + self.data["computed"]["peak_data"][str(file_info["immutable_id"])] = peak_model.dict() pattern_dfs = [pattern_df] else: @@ -303,9 +304,8 @@ def generate_xrd_plot(self, filenames: list[str | Path] | None = None) -> None: pattern_dfs.append(pattern_df) peak_model = PeakInformation(**peak_data) - if "peak_data" not in self.data: - self.data["peak_data"] = {} - self.data["peak_data"][f] = peak_model.dict() + if "computed" not in self.data: + self.data["computed"] = {"peak_data": {}} + self.data["computed"]["peak_data"][f] = peak_model.dict() if pattern_dfs: p = self._make_plots(pattern_dfs, y_options) diff --git a/pydatalab/src/pydatalab/blocks/base.py b/pydatalab/src/pydatalab/blocks/base.py index 25e86925c..25034a5f7 100644 --- a/pydatalab/src/pydatalab/blocks/base.py +++ b/pydatalab/src/pydatalab/blocks/base.py @@ -1,4 +1,5 @@ import functools +import json import random import warnings from collections.abc import Callable, Sequence @@ -6,6 +7,7 @@ from pydatalab import __version__ from pydatalab.logger import LOGGER +from pydatalab.models.blocks import DataBlockResponse __all__ = ("generate_random_id", "DataBlock", "generate_js_callback_single_float_parameter") @@ -100,6 +102,8 @@ def generate_random_id(): class DataBlock: """Base class for a data block.""" + block_db_model = DataBlockResponse + name: str = "base" """The human-readable block name specifying which technique or file format it pertains to.
@@ -191,39 +195,14 @@ def __init__( collection_id, ) - def to_db(self): - """returns a dictionary with the data for this - block, ready to be input into mongodb""" + def to_db(self) -> dict: + """Returns a dictionary with the data for this block, + ready to be inserted into MongoDB.""" - from bson import ObjectId LOGGER.debug("Casting block %s to database object.", self.__class__.__name__) - - self.data.pop("bokeh_plot_data", None) - self.data.pop("b64_encoded_image", None) - - if "file_id" in self.data: - dict_for_db = self.data.copy() # gross, I know - dict_for_db["file_id"] = ObjectId(dict_for_db["file_id"]) - return dict_for_db - - return self.data - - @classmethod - def from_db(cls, block: dict): - """Create a block from data stored in the database.""" - LOGGER.debug("Loading block %s from database object.", cls.__class__.__name__) - new_block = cls( - item_id=block.get("item_id"), - collection_id=block.get("collection_id"), - init_data=block, + return self.block_db_model(**self.data).dict( + exclude={"bokeh_plot_data", "b64_encoded_image"}, exclude_unset=True ) - if "file_id" in new_block.data: - new_block.data["file_id"] = str(new_block.data["file_id"]) - - if new_block.data.get("title", "") == new_block.description: - new_block.data["title"] = new_block.name - - return new_block def to_web(self) -> dict[str, Any]: """Returns a JSON serializable dictionary to render the data block on the web.""" @@ -258,7 +237,9 @@ def to_web(self) -> dict[str, Any]: else: self.data.pop("warnings", None) - return self.data + model = self.block_db_model(**self.data) + serialized = json.loads(model.json(exclude_unset=True, exclude_none=True)) + return serialized def process_events(self, events: list[dict] | dict): """Handle any supported events passed to the block.""" @@ -324,7 +305,6 @@ def from_web(cls, data: dict): data: The block data to initialize the block with. """ - LOGGER.debug("Loading block %s from web request.", cls.__class__.__name__) block = cls( item_id=data.get("item_id"), collection_id=data.get("collection_id"), @@ -334,9 +314,9 @@ def update_from_web(self, data: dict): - """Update the block with data received from a web request. - - Only updates fields that are specified in the dictionary - other fields are left alone + """Update the block with validated data received from a web request. + Will strip any fields that are "computed" or otherwise not controllable + by the user. Parameters: data: A dictionary of data to update the block with. @@ -346,6 +326,11 @@ "Updating block %s from web request", self.__class__.__name__, ) - self.data.update(data) + self.data.update( + self.block_db_model(**data).dict( + exclude={"computed", "metadata", "bokeh_plot_data", "b64_encoded_image"}, + exclude_unset=True, + ) + ) return self diff --git a/pydatalab/src/pydatalab/models/blocks.py b/pydatalab/src/pydatalab/models/blocks.py index cec98577c..d4ecfaac3 100644 --- a/pydatalab/src/pydatalab/models/blocks.py +++ b/pydatalab/src/pydatalab/models/blocks.py @@ -6,6 +6,9 @@ class DataBlockResponse(BaseModel): """A generic response model for a block, i.e., what is stored in `self.data` in the corresponding DataBlock class. + + It is expected but not mandatory that this model will be extended by the specific block type + where possible.
""" blocktype: str @@ -32,12 +35,28 @@ class DataBlockResponse(BaseModel): file_ids: list[PyObjectId] | None = None """A list of file IDs associated with the block, if any.""" - b64_encoded_image: dict[PyObjectId, str] | None = None - """Any base64-encoded image data associated with the block, keyed by file_id, if any.""" + errors: list[str] | None = None + """Any errors that occurred during block processing.""" + + warnings: list[str] | None = None + """Any warnings that occurred during block processing.""" + + b64_encoded_image: dict[str, str] | None + """Any base64-encoded image data associated with the block, keyed by `file_id`.""" - bokeh_plot_data: str | None = None + bokeh_plot_data: dict | None """A JSON-encoded string containing the Bokeh plot data, if any.""" + computed: dict | None = None + """Any processed or computed data associated with the block, small enough to store and filter directly in the database, + i.e., strings or a few hundred numbers not exceeding 16KB in size. + Examples could include peak positions, and widths, but not the full spectrum. + """ + + metadata: dict | None = None + """Any structured metadata associated with the block, for example, + experimental acquisition parameters.""" + class Config: allow_population_by_field_name = True json_encoders = JSON_ENCODERS diff --git a/pydatalab/src/pydatalab/routes/v0_1/blocks.py b/pydatalab/src/pydatalab/routes/v0_1/blocks.py index 2e014ceed..131ac56fe 100644 --- a/pydatalab/src/pydatalab/routes/v0_1/blocks.py +++ b/pydatalab/src/pydatalab/routes/v0_1/blocks.py @@ -33,10 +33,6 @@ def add_data_block(): block = BLOCK_TYPES[block_type](item_id=item_id) - data = block.to_db() - - # currently, adding to both blocks and blocks_obj to mantain compatibility with - # the old site. The new site only uses blocks_obj if insert_index: display_order_update = { "$each": [block.block_id], @@ -48,8 +44,8 @@ def add_data_block(): result = flask_mongo.db.items.update_one( {"item_id": item_id, **get_default_permissions(user_only=True)}, { - "$push": {"blocks": data, "display_order": display_order_update}, - "$set": {f"blocks_obj.{block.block_id}": data}, + "$push": {"display_order": display_order_update}, + "$set": {f"blocks_obj.{block.block_id}": block.to_db()}, }, ) @@ -140,7 +136,7 @@ def add_collection_data_block(): ) -def _save_block_to_db(block: DataBlock) -> bool: +def _save_block_to_db(block: DataBlock) -> None: """Save data for a single block within an item to the database, overwriting previous data saved there. 
@@ -171,8 +167,6 @@ def _save_block_to_db(block: DataBlock) -> bool: f"Failed to save block, likely because item_id ({block.data.get('item_id')}), collection_id ({block.data.get('collection_id')}) and/or block_id ({block.block_id}) wasn't found" ) - return True - @BLOCKS.route("/update-block/", methods=["POST"]) @BLOCKS.route("/blocks/", methods=["POST"]) @@ -185,7 +179,7 @@ def update_block(): request_json = request.get_json() block_data = request_json["block_data"] event_data = request_json.get("event_data", None) - save_to_db = request_json.get("save_to_db", False) + block_type = block_data["blocktype"] if block_type not in BLOCK_TYPES: @@ -201,14 +195,17 @@ def update_block(): except NotImplementedError: pass - saved_successfully = False - if save_to_db: - saved_successfully = _save_block_to_db(block) + # Save state from UI + _save_block_to_db(block) + + # Reload the block with new UI state + new_block_data = block.to_web() + + # Save results to DB + _save_block_to_db(block) return ( - jsonify( - status="success", saved_successfully=saved_successfully, new_block_data=block.to_web() - ), + jsonify(status="success", saved_successfully=True, new_block_data=new_block_data), 200, ) diff --git a/pydatalab/src/pydatalab/routes/v0_1/items.py b/pydatalab/src/pydatalab/routes/v0_1/items.py index f9d1c1d23..a886f57e2 100644 --- a/pydatalab/src/pydatalab/routes/v0_1/items.py +++ b/pydatalab/src/pydatalab/routes/v0_1/items.py @@ -28,61 +28,6 @@ def _(): ... -def reserialize_blocks(display_order: list[str], blocks_obj: dict[str, dict]) -> dict[str, dict]: - """Create the corresponding Python objects from JSON block data, then - serialize it again as JSON to populate any missing properties. - - Parameters: - blocks_obj: A dictionary containing the JSON block data, keyed by block ID. - - Returns: - A dictionary with the re-serialized block data. - - """ - for block_id in display_order: - try: - block_data = blocks_obj[block_id] - except KeyError: - LOGGER.warning(f"block_id {block_id} found in display order but not in blocks_obj") - continue - blocktype = block_data["blocktype"] - blocks_obj[block_id] = ( - BLOCK_TYPES.get(blocktype, BLOCK_TYPES["notsupported"]).from_db(block_data).to_web() - ) - - return blocks_obj - - -# Seems to be obselete now? -def dereference_files(file_ids: list[str | ObjectId]) -> dict[str, dict]: - """For a list of Object IDs (as strings or otherwise), query the files collection - and return a dictionary of the data stored under each ID. - - Parameters: - file_ids: The list of IDs of files to return; - - Returns: - The dereferenced data as a dictionary with (string) ID keys. - - """ - results = { - str(f["_id"]): f - for f in flask_mongo.db.files.find( - { - "_id": {"$in": [ObjectId(_id) for _id in file_ids]}, - } - ) - } - if len(results) != len(file_ids): - raise RuntimeError( - "Some file IDs did not have corresponding database entries.\n" - f"Returned: {list(results.keys())}\n" - f"Requested: {file_ids}\n" - ) - - return results - - @ITEMS.route("/equipment/", methods=["GET"]) def get_equipment_summary(): _project = { @@ -832,17 +777,9 @@ def delete_sample(): @ITEMS.route("/items/", methods=["GET"]) @ITEMS.route("/get-item-data/", methods=["GET"]) -def get_item_data( - item_id: str | None = None, refcode: str | None = None, load_blocks: bool = False -): +def get_item_data(item_id: str | None = None, refcode: str | None = None): """Generates a JSON response for the item with the given `item_id`, or `refcode` additionally resolving relationships to files and other items. 
- - Parameters: - load_blocks: Whether to regenerate any data blocks associated with this - sample (i.e., create the Python object corresponding to the block and - call its render function). - """ redirect_to_ui = bool(request.args.get("redirect-to-ui", default=False, type=json.loads)) if refcode and redirect_to_ui and CONFIG.APP_URL: @@ -911,8 +848,6 @@ def get_item_data( raise KeyError(f"Item {item_id=} has no type field in document.") doc = ItemModel(**doc) - if load_blocks: - doc.blocks_obj = reserialize_blocks(doc.display_order, doc.blocks_obj) # find any documents with relationships that mention this document relationships_query_results = flask_mongo.db.items.find( diff --git a/pydatalab/tests/apps/test_chat.py b/pydatalab/tests/apps/test_chat.py new file mode 100644 index 000000000..8285dc632 --- /dev/null +++ b/pydatalab/tests/apps/test_chat.py @@ -0,0 +1,32 @@ +from pydatalab.apps.chat import ChatBlock + + +def test_chatblock(): + chat = ChatBlock(item_id="test") + chat.data["model"] = "langchain-debug" + + chat.start_conversation(item_data={"item_data": {"type": "samples", "item_id": "test"}}) + + assert chat.data["messages"][0]["content"].startswith("You are whinchat (lowercase w)") + assert chat.data["messages"][1]["content"].startswith("Here is the JSON data for") + + chat.render() + assert chat.data["messages"][2]["role"] == "assistant" + + chat.data["prompt"] = "Hello there" + chat.render() + + assert chat.data["messages"][-1] == {"role": "assistant", "content": "Hello there"} + assert chat.data["messages"][-2] == {"role": "user", "content": "Hello there"} + assert chat.data["messages"][0]["content"].startswith("You are whinchat (lowercase w)") + assert chat.data["messages"][1]["content"].startswith("Here is the JSON data for") + assert len(chat.data["messages"]) == 5 + + chat.data["prompt"] = "Hello there again" + chat.render() + + assert chat.data["messages"][-1] == {"role": "assistant", "content": "Hello there again"} + assert chat.data["messages"][-2] == {"role": "user", "content": "Hello there again"} + assert chat.data["messages"][0]["content"].startswith("You are whinchat (lowercase w)") + assert chat.data["messages"][1]["content"].startswith("Here is the JSON data for") + assert len(chat.data["messages"]) == 7 diff --git a/pydatalab/tests/apps/test_nmr.py b/pydatalab/tests/apps/test_nmr.py index c0c933801..4eea765ca 100644 --- a/pydatalab/tests/apps/test_nmr.py +++ b/pydatalab/tests/apps/test_nmr.py @@ -91,28 +91,30 @@ def test_nmr_block( nmr_1d_solution_path, nmr_1d_solution_path_renamed, nmr_1d_solid_path, nmr_2d_matpass_path ): block = NMRBlock(item_id="nmr-block") - block.read_bruker_nmr_data(nmr_1d_solid_path) + block.processed_data, block.data["metadata"] = block.read_bruker_nmr_data(nmr_1d_solid_path) assert block.data["metadata"]["topspin_title"].split("\n")[0] == "7Li 40 kHz 40 C hahn-echo" block.generate_nmr_plot(parse=False) plot = block.data["bokeh_plot_data"] assert plot is not None block = NMRBlock(item_id="nmr-block") - block.read_bruker_nmr_data(nmr_1d_solution_path) + block.processed_data, block.data["metadata"] = block.read_bruker_nmr_data(nmr_1d_solution_path) assert block.data["metadata"]["topspin_title"].split("\n")[0] == "31P reference, 85% H3PO4" block.generate_nmr_plot(parse=False) plot = block.data["bokeh_plot_data"] assert plot is not None block = NMRBlock(item_id="nmr-block") - block.read_bruker_nmr_data(nmr_1d_solution_path_renamed) + block.processed_data, block.data["metadata"] = block.read_bruker_nmr_data( + nmr_1d_solution_path_renamed 
+ ) assert block.data["metadata"]["topspin_title"].split("\n")[0] == "31P reference, 85% H3PO4" block.generate_nmr_plot(parse=False) plot = block.data["bokeh_plot_data"] assert plot is not None block = NMRBlock(item_id="nmr-block") - block.read_bruker_nmr_data(nmr_2d_matpass_path) + block.processed_data, block.data["metadata"] = block.read_bruker_nmr_data(nmr_2d_matpass_path) assert block.data["metadata"]["topspin_title"].split("\n")[0] == "7Li 40kHz 40 C MATPASS" # catch warning about processed data with pytest.warns(UserWarning, match="Only metadata"): diff --git a/pydatalab/tests/server/conftest.py b/pydatalab/tests/server/conftest.py index 43fc0d923..c8a7e2fe7 100644 --- a/pydatalab/tests/server/conftest.py +++ b/pydatalab/tests/server/conftest.py @@ -71,6 +71,7 @@ def app_config(tmp_path_factory): "REMOTE_FILESYSTEMS": example_remotes, "FILE_DIRECTORY": str(tmp_path_factory.mktemp("files")), "TESTING": False, + "ROOT_PATH": "/", "EMAIL_AUTH_SMTP_SETTINGS": { "MAIL_SERVER": "smtp.example.com", "MAIL_PORT": 587, diff --git a/pydatalab/tests/server/test_blocks.py b/pydatalab/tests/server/test_blocks.py index 697e8063f..9f2ec5135 100644 --- a/pydatalab/tests/server/test_blocks.py +++ b/pydatalab/tests/server/test_blocks.py @@ -1,11 +1,8 @@ -from pathlib import Path - import pytest from pydatalab.apps import BLOCK_TYPES, BLOCKS -@pytest.mark.dependency() def test_get_all_available_block_types(): """Test that we can enumerate all available block types.""" assert len(BLOCKS) > 0 @@ -21,7 +18,6 @@ def test_get_all_available_block_types(): assert BLOCK_TYPES[block_class.blocktype] == block_class -@pytest.mark.dependency(depends=["test_get_all_available_block_types"]) @pytest.mark.parametrize("block_type", list(BLOCK_TYPES.keys())) def test_create_sample_with_each_block_type(admin_client, block_type, default_sample_dict): """Test creating a sample and adding each available block type via API.""" @@ -30,9 +26,7 @@ def test_create_sample_with_each_block_type(admin_client, block_type, default_sa sample_data["item_id"] = sample_id response = admin_client.post("/new-sample/", json=sample_data) - assert response.status_code == 201, ( - f"Failed to create sample for {block_type}: {response.json()}" - ) + assert response.status_code == 201, f"Failed to create sample for {block_type}: {response.json}" assert response.json["status"] == "success" response = admin_client.post( @@ -44,7 +38,7 @@ def test_create_sample_with_each_block_type(admin_client, block_type, default_sa }, ) - assert response.status_code == 200, f"Failed to add {block_type} block: {response.json()}" + assert response.status_code == 200, f"Failed to add {block_type} block: {response.json}" assert response.json["status"] == "success" assert response.json["new_block_obj"] assert response.json["new_block_obj"]["blocktype"] == block_type @@ -61,7 +55,6 @@ def test_create_sample_with_each_block_type(admin_client, block_type, default_sa assert first_block["blocktype"] == block_type -@pytest.mark.dependency(depends=["test_get_all_available_block_types"]) def test_invalid_block_type(admin_client, default_sample_dict): """Test that invalid block types are rejected.""" sample_id = "test_sample_invalid_block" @@ -84,7 +77,6 @@ def test_invalid_block_type(admin_client, default_sample_dict): assert "Invalid block type" in response.json["message"] -@pytest.mark.dependency(depends=["test_get_all_available_block_types"]) def test_add_multiple_blocks_to_sample(admin_client, default_sample_dict): """Test adding multiple different block types to the same 
sample.""" sample_id = "test_sample_multiple_blocks" @@ -169,7 +161,6 @@ def test_block_permissions(client, admin_client, unauthenticated_client, default assert response.status_code == 401 -@pytest.mark.dependency(depends=["test_get_all_available_block_types"]) def test_add_block_to_nonexistent_item(admin_client): """Test that adding a block to a nonexistent item fails gracefully.""" block_type = list(BLOCK_TYPES.keys())[0] @@ -187,7 +178,6 @@ def test_add_block_to_nonexistent_item(admin_client): assert "Update failed" in response.json["message"] -@pytest.mark.dependency(depends=["test_get_all_available_block_types"]) def test_block_info_endpoint_contains_all_blocks(client): """Test that the /info/blocks endpoint returns all available block types.""" response = client.get("/info/blocks") @@ -201,11 +191,10 @@ def test_block_info_endpoint_contains_all_blocks(client): ) -@pytest.mark.dependency(depends=["test_get_all_available_block_types"]) -def test_create_sample_with_example_files(admin_client, default_sample_dict): - """Create a test sample with multiple block types and attached example files.""" +def test_uvvis_block_lifecycle(admin_client, default_sample_dict, example_data_dir): + block_type = "uv-vis" - sample_id = "test_sample_with_files" + sample_id = f"test_sample_with_files-{block_type}-lifecycle" sample_data = default_sample_dict.copy() sample_data["item_id"] = sample_id @@ -213,182 +202,345 @@ def test_create_sample_with_example_files(admin_client, default_sample_dict): assert response.status_code == 201 assert response.json["status"] == "success" - block_file_mapping = { - "tabular": "csv", - "cycle": "echem", - "ftir": "FTIR", - "nmr": "NMR", - "raman": "raman", - "ms": "TGA-MS", - "uv-vis": "UV-Vis", - "xrd": "XRD", - "media": "media", - } - - preferred_files = { - "cycle": "jdb11-1_c3_gcpl_5cycles_2V-3p8V_C-24_data_C09.mpr", - "tabular": "simple.csv", - "ftir": "2024-10-10_FeSO4_ref.asp", - "nmr": "1.zip", - "raman": "raman_example.txt", - "ms": "20221128 134958 TGA MS Megan.asc", - "xrd": "Scan_C4.xrdml", - "media": "grey_group_logo.tif", - } - - example_data_path = Path(__file__).parent.parent.parent / "example_data" - added_blocks = {} - uploaded_files = [] - block_index = 0 - - for block_type, folder_name in block_file_mapping.items(): + response = admin_client.post( + "/add-data-block/", + json={ + "block_type": block_type, + "item_id": sample_id, + "index": 0, + }, + ) + + assert response.status_code == 200, f"Failed to add {block_type} block: {response.json}" + assert response.json["status"] == "success" + + block_data = response.json["new_block_obj"] + block_id = block_data["block_id"] + + uvvis_folder = example_data_dir / "UV-Vis" + + example_files = uvvis_folder.glob("*") + example_file_ids = [] + + for example_file in example_files: + with open(example_file, "rb") as f: + response = admin_client.post( + "/upload-file/", + buffered=True, + content_type="multipart/form-data", + data={ + "item_id": sample_id, + "file": [(f, example_file.name)], + "type": "application/octet-stream", + "replace_file": "null", + "relativePath": "null", + }, + ) + assert response.status_code == 201, f"Failed to upload {example_file.name}" + assert response.json["status"] == "success" + file_id = response.json["file_id"] + example_file_ids.append(file_id) + + assert len(example_file_ids) == 3 + + response = admin_client.get(f"/get-item-data/{sample_id}") + assert response.status_code == 200 + item_data = response.json["item_data"] + block_data = item_data["blocks_obj"][block_id] + 
block_data["file_id"] = example_file_ids[0] + block_data["selected_file_order"] = example_file_ids + + response = admin_client.post("/update-block/", json={"block_data": block_data}) + + assert response.status_code == 200 + web_block = response.json["new_block_data"] + assert "bokeh_plot_data" in web_block + assert web_block.get("errors") is None + + +def test_xrd_block_lifecycle(admin_client, default_sample_dict, example_data_dir): + from pydatalab.apps.xrd import XRDBlock + + block_type = "xrd" + + sample_id = f"test_sample_with_files-{block_type}-lifecycle" + sample_data = default_sample_dict.copy() + sample_data["item_id"] = sample_id + + response = admin_client.post("/new-sample/", json=sample_data) + assert response.status_code == 201 + assert response.json["status"] == "success" + + response = admin_client.post( + "/add-data-block/", + json={ + "block_type": block_type, + "item_id": sample_id, + "index": 0, + }, + ) + + assert response.status_code == 200, f"Failed to add {block_type} block: {response.json}" + assert response.json["status"] == "success" + + block_data = response.json["new_block_obj"] + block_id = block_data["block_id"] + + block_file = "XRD/cod_9004112.cif" + + example_file = example_data_dir / block_file + + with open(example_file, "rb") as f: response = admin_client.post( - "/add-data-block/", - json={ - "block_type": block_type, + "/upload-file/", + buffered=True, + content_type="multipart/form-data", + data={ "item_id": sample_id, - "index": block_index, + "file": [(f, example_file.name)], + "type": "application/octet-stream", + "replace_file": "null", + "relativePath": "null", }, ) - assert response.status_code == 200, f"Failed to add {block_type} block: {response.json()}" - assert response.json["status"] == "success" + assert response.status_code == 201, f"Failed to upload {example_file.name}" + assert response.json["status"] == "success" + file_id = response.json["file_id"] - block_data = response.json["new_block_obj"] - block_id = block_data["block_id"] - added_blocks[block_type] = {"block_id": block_id, "index": block_index} - - if block_type == "uv-vis": - folder_path = example_data_path / folder_name - if folder_path.exists(): - uv_files = list(folder_path.glob("*")) - assert len(uv_files) >= 2, f"UV-Vis needs at least 2 files, found {len(uv_files)}" - - file_ids = [] - for i, uv_file in enumerate(uv_files[:2]): - with open(uv_file, "rb") as f: - response = admin_client.post( - "/upload-file/", - buffered=True, - content_type="multipart/form-data", - data={ - "item_id": sample_id, - "file": [(f, uv_file.name)], - "type": "application/octet-stream", - "replace_file": "null", - "relativePath": "null", - }, - ) - - if response.status_code == 201: - assert response.json["status"] == "success" - file_id = response.json["file_id"] - file_ids.append(file_id) - uploaded_files.append({"block_type": block_type, "filename": uv_file.name}) - - if file_ids: - response = admin_client.get(f"/get-item-data/{sample_id}") - assert response.status_code == 200 - item_data = response.json["item_data"] - block_data = item_data["blocks_obj"][block_id] - block_data["file_id"] = file_ids[0] - block_data["selected_file_order"] = file_ids - - response = admin_client.post( - "/update-block/", json={"block_data": block_data, "save_to_db": True} - ) - - assert response.status_code == 200 - - else: - folder_path = example_data_path / folder_name - if folder_path.exists(): - files_in_folder = list(folder_path.glob("*")) - assert len(files_in_folder) > 0, f"No files found in 
{folder_path}" - - if block_type in preferred_files: - preferred_file = folder_path / preferred_files[block_type] - if preferred_file.exists(): - example_file = preferred_file - else: - example_file = files_in_folder[0] - else: - example_file = files_in_folder[0] - - with open(example_file, "rb") as f: - response = admin_client.post( - "/upload-file/", - buffered=True, - content_type="multipart/form-data", - data={ - "item_id": sample_id, - "file": [(f, example_file.name)], - "type": "application/octet-stream", - "replace_file": "null", - "relativePath": "null", - }, - ) - - assert response.status_code == 201, f"Failed to upload {example_file.name}" - assert response.json["status"] == "success" - file_id = response.json["file_id"] - - response = admin_client.get(f"/get-item-data/{sample_id}") - assert response.status_code == 200 - item_data = response.json["item_data"] - block_data = item_data["blocks_obj"][block_id] - block_data["file_id"] = file_id - - response = admin_client.post( - "/update-block/", json={"block_data": block_data, "save_to_db": True} - ) - assert response.status_code == 200 - uploaded_files.append({"block_type": block_type, "filename": example_file.name}) - - # For the media block, ensure we have a base64 encoded b64_encoded_image - # then try to save it back to the item - if block_type == "media": - block_data = response.json["new_block_data"] - assert "b64_encoded_image" in block_data - - response = admin_client.get(f"/get-item-data/{sample_id}") - assert response.status_code == 200 - item_data = response.json["item_data"] - - item_data["blocks_obj"][block_id] = block_data - response = admin_client.post( - "/save-item/", json={"item_id": sample_id, "data": item_data} - ) - assert response.status_code == 200 - - block_index += 1 - - response = admin_client.get(f"/get-item-data/{sample_id}?load_blocks=1") + response = admin_client.get(f"/get-item-data/{sample_id}") + assert response.status_code == 200 + item_data = response.json["item_data"] + block_data = item_data["blocks_obj"][block_id] + block_data["file_id"] = file_id + block_data["wavelength"] = 2.0 + + response = admin_client.post("/update-block/", json={"block_data": block_data}) + + web_block = response.json["new_block_data"] + assert "bokeh_plot_data" in web_block + assert "computed" in web_block + assert web_block["wavelength"] == 2.0 + assert "peak_data" in web_block["computed"] + assert "file_id" in web_block + assert web_block["file_id"] == file_id + assert web_block.get("errors") is None + + block = XRDBlock.from_web(web_block) + db = block.to_db() + # 'computed' keys should be dropped when loading from web + assert "bokeh_plot_data" not in db + assert "computed" not in db + assert db["wavelength"] == 2.0 + + # But they should still be in the database + response = admin_client.get(f"/get-item-data/{sample_id}") assert response.status_code == 200 - assert response.json["status"] == "success" item_data = response.json["item_data"] + assert response.json["status"] == "success" assert "blocks_obj" in item_data - assert len(item_data["blocks_obj"]) == len(added_blocks) - assert len(item_data["display_order"]) == len(added_blocks) + block = item_data["blocks_obj"][block_id] + assert "computed" in block + assert "peak_data" in block["computed"] + assert block["wavelength"] == 2.0 + + +def test_comment_block_manipulation(admin_client, default_sample_dict, database): + """Create a test sample with a comment block and test it for + dealing with unhandled data.""" + + block_type = "comment" + + sample_id = 
"test_sample_with_files-comment" + sample_data = default_sample_dict.copy() + sample_data["item_id"] = sample_id + + response = admin_client.post("/new-sample/", json=sample_data) + assert response.status_code == 201 + assert response.json["status"] == "success" + + response = admin_client.post( + "/add-data-block/", + json={ + "block_type": block_type, + "item_id": sample_id, + "index": 0, + }, + ) + + assert response.status_code == 200, f"Failed to add {block_type} block: {response.json}" + assert response.json["status"] == "success" - assert "file_ObjectIds" in item_data - assert len(item_data["file_ObjectIds"]) >= len(uploaded_files) + block_data = response.json["new_block_obj"] + block_id = block_data["block_id"] + block_data["freeform_comment"] = "This is a test comment block." + block_data["title"] = "Test Comment Block" + block_data["errors"] = ["Test Network Failure"] - block_types_in_sample = [block["blocktype"] for block in item_data["blocks_obj"].values()] - expected_block_types = list(block_file_mapping.keys()) + response = admin_client.post("/update-block/", json={"block_data": block_data}) + assert response.status_code == 200 + assert response.json["status"] == "success" + assert response.json["new_block_data"]["blocktype"] == block_type + assert response.json["new_block_data"]["freeform_comment"] == "This is a test comment block." + assert response.json["new_block_data"]["title"] == "Test Comment Block" + assert response.json["new_block_data"].get("errors") is None - for expected_type in expected_block_types: - assert expected_type in block_types_in_sample + # Check that this result was actually stored + response = admin_client.get(f"/get-item-data/{sample_id}") + assert response.status_code == 200 + item_data = response.json["item_data"] + assert response.json["status"] == "success" + assert ( + response.json["item_data"]["blocks_obj"][block_id]["freeform_comment"] + == "This is a test comment block." + ) + assert "errors" not in response.json["item_data"]["blocks_obj"][block_id] + # Try to add some bad data + block_data["bokeh_plot_data"] = {"bokeh": "json"} + block_data["random_new_key"] = "test new key" + block_data["freeform_comment"] = "This is a test comment block with extra data." + response = admin_client.post("/update-block/", json={"block_data": block_data}) + assert response.status_code == 200 + assert response.json["status"] == "success" + assert response.json["new_block_data"]["blocktype"] == block_type + assert ( + response.json["new_block_data"]["freeform_comment"] + == "This is a test comment block with extra data." 
+    )
+    assert response.json["new_block_data"]["title"] == "Test Comment Block"
+    assert "bokeh_plot_data" not in response.json["new_block_data"]
+
+    # Extra random keys are kept in the response, in case they are parameters
+    # for the block that are not yet handled
+    assert "random_new_key" in response.json["new_block_data"]
+
+    raw_item = database.items.find_one({"item_id": sample_id})
+    assert raw_item
+    raw_block = raw_item["blocks_obj"][block_id]
+    assert "bokeh_plot_data" not in raw_block
+    # assert "random_new_key" not in raw_block
+    assert "errors" not in raw_block
+
+    # Finally, try to update using the save-item endpoint, and make sure any
+    # known bad data gets stripped out
+    item_data["blocks_obj"][block_id]["bokeh_plot_data"] = {"bokeh": "json"}
+    item_data["blocks_obj"][block_id]["random_new_key"] = "test new key again"
+    item_data["blocks_obj"][block_id]["freeform_comment"] = "This is the latest test comment."
+
+    response = admin_client.post("/save-item/", json={"item_id": sample_id, "data": item_data})
+    assert response.status_code == 200
+
+    response = admin_client.get(f"/get-item-data/{sample_id}")
+    assert response.status_code == 200
+    assert response.json["status"] == "success"
+    item_data = response.json["item_data"]
+    block = item_data["blocks_obj"][block_id]
+
+    assert block["freeform_comment"] == "This is the latest test comment."
+    assert block.get("bokeh_plot_data") is None
+    assert block["random_new_key"] == "test new key again"
+
+
+@pytest.mark.parametrize(
+    "block_type, block_file",
+    [
+        ("tabular", "csv/simple.csv"),
+        ("cycle", "echem/jdb11-1_c3_gcpl_5cycles_2V-3p8V_C-24_data_C09.mpr"),
+        ("ftir", "FTIR/2024-10-10_FeSO4_ref.asp"),
+        ("nmr", "NMR/1.zip"),
+        ("raman", "raman/raman_example.txt"),
+        ("ms", "TGA-MS/20221128 134958 TGA MS Megan.asc"),
+        ("xrd", "XRD/Scan_C4.xrdml"),
+        ("media", "media/grey_group_logo.tif"),
+    ],
+)
+def test_create_sample_with_example_files(
+    block_type, block_file, admin_client, example_data_dir, default_sample_dict, database
+):
+    """Create a test sample with a block and an uploaded file, and test the
+    block for errors."""
+
+    sample_id = f"test_sample_with_files-{block_type}"
+    sample_data = default_sample_dict.copy()
+    sample_data["item_id"] = sample_id
+
+    response = admin_client.post("/new-sample/", json=sample_data)
+    assert response.status_code == 201
+    assert response.json["status"] == "success"
+
+    response = admin_client.post(
+        "/add-data-block/",
+        json={
+            "block_type": block_type,
+            "item_id": sample_id,
+            "index": 0,
+        },
+    )
+
+    assert response.status_code == 200, f"Failed to add {block_type} block: {response.json}"
+    assert response.json["status"] == "success"
+
+    block_data = response.json["new_block_obj"]
+    block_id = block_data["block_id"]
+
+    example_file = example_data_dir / block_file
+
+    with open(example_file, "rb") as f:
+        response = admin_client.post(
+            "/upload-file/",
+            buffered=True,
+            content_type="multipart/form-data",
+            data={
+                "item_id": sample_id,
+                "file": [(f, example_file.name)],
+                "type": "application/octet-stream",
+                "replace_file": "null",
+                "relativePath": "null",
+            },
+        )
+
+    assert response.status_code == 201, f"Failed to upload {example_file.name}"
+    assert response.json["status"] == "success"
+    file_id = response.json["file_id"]
+
+    response = admin_client.get(f"/get-item-data/{sample_id}")
+    assert response.status_code == 200
+    item_data = response.json["item_data"]
+    block_data = item_data["blocks_obj"][block_id]
+    block_data["file_id"] = file_id
+
+    response = admin_client.post("/update-block/", json={"block_data": block_data})
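+
+    # Each block type should be able to parse its example file and return an
+    # enriched payload to the client (plot data, computed results, etc.)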
admin_client.post("/update-block/", json={"block_data": block_data}) + assert response.status_code == 200 + assert response.json["status"] == "success" + assert response.json["new_block_data"]["blocktype"] == block_type + + # Some specific checks for different block types: if block_type != "media": assert response.json["new_block_data"]["bokeh_plot_data"] is not None - blocks_with_files = sum(1 for block in item_data["blocks_obj"].values() if block.get("file_id")) - blocks_without_files = [ - block["blocktype"] for block in item_data["blocks_obj"].values() if not block.get("file_id") - ] + if block_type == "xrd": + assert response.json["new_block_data"]["computed"]["peak_data"] is not None - assert blocks_with_files >= len(uploaded_files) // 2, ( - f"Not enough blocks have files attached. Blocks without files: {blocks_without_files}" - ) + # For the media block, check that a TIF image is present and can be saved correctly + if block_type == "media": + block_data = response.json["new_block_data"] + assert "b64_encoded_image" in block_data + + response = admin_client.get(f"/get-item-data/{sample_id}") + assert response.status_code == 200 + item_data = response.json["item_data"] + + item_data["blocks_obj"][block_id] = block_data + response = admin_client.post("/save-item/", json={"item_id": sample_id, "data": item_data}) + assert response.status_code == 200 + + response = admin_client.get(f"/get-item-data/{sample_id}") + assert response.status_code == 200 + assert response.json["status"] == "success" + + item_data = response.json["item_data"] + assert "blocks_obj" in item_data + + if block_type == "xrd": + doc = database.items.find_one({"item_id": sample_id}, projection={"blocks_obj": 1}) + assert doc["blocks_obj"][block_id]["computed"]["peak_data"] is not None diff --git a/webapp/src/server_fetch_utils.js b/webapp/src/server_fetch_utils.js index 6e0ad3bb7..a41a35531 100644 --- a/webapp/src/server_fetch_utils.js +++ b/webapp/src/server_fetch_utils.js @@ -504,20 +504,15 @@ export async function getCollectionData(collection_id) { }); } -export async function updateBlockFromServer( - item_id, - block_id, - block_data, - event_data = null, - saveToDatabase = true, -) { +export async function updateBlockFromServer(item_id, block_id, block_data, event_data = null) { // Send the current block state to the API and receive an updated version // of the block in return, including any event data. // // - Will strip known "large data" keys, even if not formalised, e.g., bokeh_plot_data. 
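+  //   These keys are returned afresh by the server on each update, so they
+  //   are safe to drop from the request payload here.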
   delete block_data.bokeh_plot_data;
-  delete block_data.processed_data;
+  delete block_data.b64_encoded_image;
+  delete block_data.computed;
   delete block_data.metadata;
 
   store.commit("setBlockUpdating", block_id);
@@ -526,7 +521,6 @@ export async function updateBlockFromServer(
       block_id: block_id,
       block_data: block_data,
       event_data: event_data,
-      save_to_db: saveToDatabase,
     })
     .then(function (response_json) {
       store.commit("updateBlockData", {
@@ -592,6 +586,26 @@ export function updateItemPermissions(refcode, creators) {
 export function saveItem(item_id) {
   var item_data = store.state.all_item_data[item_id];
+
+  let blocks = [];
+  let keysToExclude = ["bokeh_plot_data", "computed", "metadata", "b64_encoded_image"];
+
+  // Strip large data from blocks before saving, but make
+  // sure to preserve them in the store
+  if (item_data.blocks_obj) {
+    for (const block_id in item_data.blocks_obj) {
+      blocks.push(
+        Object.fromEntries(
+          Object.entries(item_data.blocks_obj[block_id]).filter(
+            ([key]) => !keysToExclude.includes(key),
+          ),
+        ),
+      );
+    }
+  }
+
+  item_data.blocks = blocks;
+
   store.commit("setItemSaved", { item_id: item_id, isSaved: false });
   fetch_post(`${API_URL}/save-item/`, {
     item_id: item_id,