66 @date:2023/10/20 14:01
77 @desc:
88"""
9- import datetime
109import logging
1110import os
1211import threading
13- import time
1412import traceback
1513from typing import List
1614
1715import django .db .models
18- from django .db import models , transaction
1916from django .db .models import QuerySet
2017from django .db .models .functions import Substr , Reverse
2118from langchain_core .embeddings import Embeddings
2219
2320from common .config .embedding_config import VectorStore
2421from common .db .search import native_search , get_dynamics_model , native_update
25- from common .db .sql_execute import sql_execute , update_execute
2622from common .util .file_util import get_file_content
2723from common .util .lock import try_lock , un_lock
28- from common .util .page_utils import page
24+ from common .util .page_utils import page_desc
2925from dataset .models import Paragraph , Status , Document , ProblemParagraphMapping , TaskType , State
3026from embedding .models import SourceType , SearchMode
3127from smartdoc .conf import PROJECT_DIR
@@ -162,7 +158,7 @@ def embedding_paragraph_apply(paragraph_list):
162158 if is_the_task_interrupted ():
163159 break
164160 ListenerManagement .embedding_by_paragraph (str (paragraph .get ('id' )), embedding_model )
165- post_apply ()
161+ post_apply ()
166162
167163 return embedding_paragraph_apply
168164
@@ -241,13 +237,16 @@ def update_status(query_set: QuerySet, taskType: TaskType, state: State):
241237 lock .release ()
242238
243239 @staticmethod
244- def embedding_by_document (document_id , embedding_model : Embeddings ):
240+ def embedding_by_document (document_id , embedding_model : Embeddings , state_list = None ):
245241 """
246242 向量化文档
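243+ @param state_list: embedding-task states to select; only paragraphs of the document whose embedding state is in this list are vectorized (defaults to PENDING, SUCCESS, FAILURE, REVOKE and REVOKED when None)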
247244 @param document_id: 文档id
248245 @param embedding_model 向量模型
249246 :return: None
250247 """
248+ if state_list is None :
249+ state_list = [State .PENDING , State .SUCCESS , State .FAILURE , State .REVOKE , State .REVOKED ]
251250 if not try_lock ('embedding' + str (document_id )):
252251 return
253252 try :
@@ -268,11 +267,17 @@ def is_the_task_interrupted():
268267 VectorStore .get_embedding_vector ().delete_by_document_id (document_id )
269268
270269 # 根据段落进行向量化处理
271- page (QuerySet (Paragraph ).filter (document_id = document_id ).values ('id' ), 5 ,
272- ListenerManagement .get_embedding_paragraph_apply (embedding_model , is_the_task_interrupted ,
273- ListenerManagement .get_aggregation_document_status (
274- document_id )),
275- is_the_task_interrupted )
270+ page_desc (QuerySet (Paragraph )
271+ .annotate (
272+ reversed_status = Reverse ('status' ),
273+ task_type_status = Substr ('reversed_status' , TaskType .EMBEDDING .value ,
274+ 1 ),
275+ ).filter (task_type_status__in = state_list , document_id = document_id )
276+ .values ('id' ), 5 ,
277+ ListenerManagement .get_embedding_paragraph_apply (embedding_model , is_the_task_interrupted ,
278+ ListenerManagement .get_aggregation_document_status (
279+ document_id )),
280+ is_the_task_interrupted )
276281 except Exception as e :
277282 max_kb_error .error (f'向量化文档:{ document_id } 出现错误{ str (e )} { traceback .format_exc ()} ' )
278283 finally :
0 commit comments