Skip to content
Discussion options

You must be logged in to vote

File: `ms_agent\tools\docling\chunker.py`

def __init__(self,
                 #embed_model_id: str = EMBED_MODEL_ID,
                 max_tokens: int = MAX_TOKENS):
        """
        Hybrid chunker that splits interleaved picture, table, and text into chunks.

        """

        embed_model_path = '/home/jsxyhelu/.cache/modelscope/hub/models/sentence-transformers/all-MiniLM-L6-v2'  # 本地模型路径
        self.tokenizer: BaseTokenizer = HuggingFaceTokenizer(
                # 从本地路径加载模型,而不是从网络
                tokenizer=AutoTokenizer.from_pretrained(embed_model_path, local_files_only=True),
                max_tokens=max_tokens,
        )
        
        self.chunker = HybridChunker(
                …

Replies: 1 comment 1 reply

Comment options

You must be logged in to vote
1 reply
@jsxyhelu
Comment options

Answer selected by jsxyhelu
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Category
Q&A
Labels
None yet
2 participants