Skip to content

Commit 1721344

Browse files
committed
fix: 修复索引中的文档,知识库删除后依然再执行 (#934)
(cherry picked from commit 864bca6)
1 parent 1117814 commit 1721344

File tree

3 files changed

+26
-9
lines changed

3 files changed

+26
-9
lines changed

apps/common/event/listener_manage.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,11 +110,16 @@ def embedding_by_paragraph_list(paragraph_id_list, embedding_model: Embeddings):
110110
@embedding_poxy
111111
def embedding_by_paragraph_data_list(data_list, paragraph_id_list, embedding_model: Embeddings):
112112
max_kb.info(f'开始--->向量化段落:{paragraph_id_list}')
113+
status = Status.success
113114
try:
114115
# 删除段落
115116
VectorStore.get_embedding_vector().delete_by_paragraph_ids(paragraph_id_list)
117+
118+
def is_save_function():
119+
return QuerySet(Paragraph).filter(id__in=paragraph_id_list).exists()
120+
116121
# 批量向量化
117-
VectorStore.get_embedding_vector().batch_save(data_list, embedding_model)
122+
VectorStore.get_embedding_vector().batch_save(data_list, embedding_model, is_save_function)
118123
except Exception as e:
119124
max_kb_error.error(f'向量化段落:{paragraph_id_list}出现错误{str(e)}{traceback.format_exc()}')
120125
status = Status.error
@@ -141,8 +146,12 @@ def embedding_by_paragraph(paragraph_id, embedding_model: Embeddings):
141146
os.path.join(PROJECT_DIR, "apps", "common", 'sql', 'list_embedding_text.sql')))
142147
# 删除段落
143148
VectorStore.get_embedding_vector().delete_by_paragraph_id(paragraph_id)
149+
150+
def is_save_function():
151+
return QuerySet(Paragraph).filter(id=paragraph_id).exists()
152+
144153
# 批量向量化
145-
VectorStore.get_embedding_vector().batch_save(data_list, embedding_model)
154+
VectorStore.get_embedding_vector().batch_save(data_list, embedding_model, is_save_function)
146155
except Exception as e:
147156
max_kb_error.error(f'向量化段落:{paragraph_id}出现错误{str(e)}{traceback.format_exc()}')
148157
status = Status.error
@@ -175,8 +184,12 @@ def embedding_by_document(document_id, embedding_model: Embeddings):
175184
os.path.join(PROJECT_DIR, "apps", "common", 'sql', 'list_embedding_text.sql')))
176185
# 删除文档向量数据
177186
VectorStore.get_embedding_vector().delete_by_document_id(document_id)
187+
188+
def is_save_function():
189+
return QuerySet(Document).filter(id=document_id).exists()
190+
178191
# 批量向量化
179-
VectorStore.get_embedding_vector().batch_save(data_list, embedding_model)
192+
VectorStore.get_embedding_vector().batch_save(data_list, embedding_model, is_save_function)
180193
except Exception as e:
181194
max_kb_error.error(f'向量化文档:{document_id}出现错误{str(e)}{traceback.format_exc()}')
182195
status = Status.error

apps/embedding/vector/base_vector.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,9 @@ def save(self, text, source_type: SourceType, dataset_id: str, document_id: str,
8484
chunk_list = chunk_data(data)
8585
result = sub_array(chunk_list)
8686
for child_array in result:
87-
self._batch_save(child_array, embedding)
87+
self._batch_save(child_array, embedding, lambda: True)
8888

89-
def batch_save(self, data_list: List[Dict], embedding: Embeddings):
89+
def batch_save(self, data_list: List[Dict], embedding: Embeddings, is_save_function):
9090
# 获取锁
9191
lock.acquire()
9292
try:
@@ -100,7 +100,10 @@ def batch_save(self, data_list: List[Dict], embedding: Embeddings):
100100
chunk_list = chunk_data_list(data_list)
101101
result = sub_array(chunk_list)
102102
for child_array in result:
103-
self._batch_save(child_array, embedding)
103+
if is_save_function():
104+
self._batch_save(child_array, embedding, is_save_function)
105+
else:
106+
break
104107
finally:
105108
# 释放锁
106109
lock.release()
@@ -113,7 +116,7 @@ def _save(self, text, source_type: SourceType, dataset_id: str, document_id: str
113116
pass
114117

115118
@abstractmethod
116-
def _batch_save(self, text_list: List[Dict], embedding: Embeddings):
119+
def _batch_save(self, text_list: List[Dict], embedding: Embeddings, is_save_function):
117120
pass
118121

119122
def search(self, query_text, dataset_id_list: list[str], exclude_document_id_list: list[str],

apps/embedding/vector/pg_vector.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def _save(self, text, source_type: SourceType, dataset_id: str, document_id: str
5555
embedding.save()
5656
return True
5757

58-
def _batch_save(self, text_list: List[Dict], embedding: Embeddings):
58+
def _batch_save(self, text_list: List[Dict], embedding: Embeddings, is_save_function):
5959
texts = [row.get('text') for row in text_list]
6060
embeddings = embedding.embed_documents(texts)
6161
embedding_list = [Embedding(id=uuid.uuid1(),
@@ -68,7 +68,8 @@ def _batch_save(self, text_list: List[Dict], embedding: Embeddings):
6868
embedding=embeddings[index],
6969
search_vector=to_ts_vector(text_list[index]['text'])) for index in
7070
range(0, len(text_list))]
71-
QuerySet(Embedding).bulk_create(embedding_list) if len(embedding_list) > 0 else None
71+
if is_save_function():
72+
QuerySet(Embedding).bulk_create(embedding_list) if len(embedding_list) > 0 else None
7273
return True
7374

7475
def hit_test(self, query_text, dataset_id_list: list[str], exclude_document_id_list: list[str], top_number: int,

0 commit comments

Comments
 (0)