Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Dify.AI
Dify
提交
724e0537
Dify
项目概览
Dify.AI
/
Dify
8 个月前同步成功
通知
12
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Dify
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
未验证
提交
724e0537
编写于
9月 22, 2023
作者:
J
Jyong
提交者:
GitHub
9月 22, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix/qdrant data issue (#1203)
Co-authored-by:
N
jyong
<
jyong@dify.ai
>
上级
e409895c
变更
7
隐藏空白更改
内联
并排
Showing
7 changed files
with
175 additions
and
153 deletions
+175
-153
api/commands.py
api/commands.py
+80
-79
api/core/index/vector_index/base.py
api/core/index/vector_index/base.py
+5
-3
api/core/index/vector_index/qdrant.py
api/core/index/vector_index/qdrant.py
+67
-64
api/core/index/vector_index/qdrant_vector_index.py
api/core/index/vector_index/qdrant_vector_index.py
+13
-0
api/events/event_handlers/clean_when_dataset_deleted.py
api/events/event_handlers/clean_when_dataset_deleted.py
+2
-1
api/tasks/clean_dataset_task.py
api/tasks/clean_dataset_task.py
+6
-4
api/tasks/deal_dataset_vector_index_task.py
api/tasks/deal_dataset_vector_index_task.py
+2
-2
未找到文件。
api/commands.py
浏览文件 @
724e0537
...
...
@@ -3,12 +3,13 @@ import json
import
math
import
random
import
string
import
threading
import
time
import
uuid
import
click
from
tqdm
import
tqdm
from
flask
import
current_app
from
flask
import
current_app
,
Flask
from
langchain.embeddings
import
OpenAIEmbeddings
from
werkzeug.exceptions
import
NotFound
...
...
@@ -456,92 +457,92 @@ def update_qdrant_indexes():
@
click
.
command
(
'normalization-collections'
,
help
=
'restore all collections in one'
)
def
normalization_collections
():
click
.
echo
(
click
.
style
(
'Start normalization collections.'
,
fg
=
'green'
))
normalization_count
=
0
normalization_count
=
[]
page
=
1
while
True
:
try
:
datasets
=
db
.
session
.
query
(
Dataset
).
filter
(
Dataset
.
indexing_technique
==
'high_quality'
)
\
.
order_by
(
Dataset
.
created_at
.
desc
()).
paginate
(
page
=
page
,
per_page
=
5
0
)
.
order_by
(
Dataset
.
created_at
.
desc
()).
paginate
(
page
=
page
,
per_page
=
10
0
)
except
NotFound
:
break
datasets_result
=
datasets
.
items
page
+=
1
for
dataset
in
datasets
:
if
not
dataset
.
collection_binding_id
:
try
:
click
.
echo
(
'restore dataset index: {}'
.
format
(
dataset
.
id
))
try
:
embedding_model
=
ModelFactory
.
get_embedding_model
(
tenant_id
=
dataset
.
tenant_id
,
model_provider_name
=
dataset
.
embedding_model_provider
,
model_name
=
dataset
.
embedding_model
)
except
Exception
:
provider
=
Provider
(
id
=
'provider_id'
,
tenant_id
=
dataset
.
tenant_id
,
provider_name
=
'openai'
,
provider_type
=
ProviderType
.
CUSTOM
.
value
,
encrypted_config
=
json
.
dumps
({
'openai_api_key'
:
'TEST'
}),
is_valid
=
True
,
)
model_provider
=
OpenAIProvider
(
provider
=
provider
)
embedding_model
=
OpenAIEmbedding
(
name
=
"text-embedding-ada-002"
,
model_provider
=
model_provider
)
embeddings
=
CacheEmbedding
(
embedding_model
)
dataset_collection_binding
=
db
.
session
.
query
(
DatasetCollectionBinding
).
\
filter
(
DatasetCollectionBinding
.
provider_name
==
embedding_model
.
model_provider
.
provider_name
,
DatasetCollectionBinding
.
model_name
==
embedding_model
.
name
).
\
order_by
(
DatasetCollectionBinding
.
created_at
).
\
first
()
if
not
dataset_collection_binding
:
dataset_collection_binding
=
DatasetCollectionBinding
(
provider_name
=
embedding_model
.
model_provider
.
provider_name
,
model_name
=
embedding_model
.
name
,
collection_name
=
"Vector_index_"
+
str
(
uuid
.
uuid4
()).
replace
(
"-"
,
"_"
)
+
'_Node'
)
db
.
session
.
add
(
dataset_collection_binding
)
db
.
session
.
commit
()
for
i
in
range
(
0
,
len
(
datasets_result
),
5
):
threads
=
[]
sub_datasets
=
datasets_result
[
i
:
i
+
5
]
for
dataset
in
sub_datasets
:
document_format_thread
=
threading
.
Thread
(
target
=
deal_dataset_vector
,
kwargs
=
{
'flask_app'
:
current_app
.
_get_current_object
(),
'dataset'
:
dataset
,
'normalization_count'
:
normalization_count
})
threads
.
append
(
document_format_thread
)
document_format_thread
.
start
()
for
thread
in
threads
:
thread
.
join
()
click
.
echo
(
click
.
style
(
'Congratulations! restore {} dataset indexes.'
.
format
(
len
(
normalization_count
)),
fg
=
'green'
))
def
deal_dataset_vector
(
flask_app
:
Flask
,
dataset
:
Dataset
,
normalization_count
:
list
):
with
flask_app
.
app_context
():
try
:
click
.
echo
(
'restore dataset index: {}'
.
format
(
dataset
.
id
))
try
:
embedding_model
=
ModelFactory
.
get_embedding_model
(
tenant_id
=
dataset
.
tenant_id
,
model_provider_name
=
dataset
.
embedding_model_provider
,
model_name
=
dataset
.
embedding_model
)
except
Exception
:
provider
=
Provider
(
id
=
'provider_id'
,
tenant_id
=
dataset
.
tenant_id
,
provider_name
=
'openai'
,
provider_type
=
ProviderType
.
CUSTOM
.
value
,
encrypted_config
=
json
.
dumps
({
'openai_api_key'
:
'TEST'
}),
is_valid
=
True
,
)
model_provider
=
OpenAIProvider
(
provider
=
provider
)
embedding_model
=
OpenAIEmbedding
(
name
=
"text-embedding-ada-002"
,
model_provider
=
model_provider
)
embeddings
=
CacheEmbedding
(
embedding_model
)
dataset_collection_binding
=
db
.
session
.
query
(
DatasetCollectionBinding
).
\
filter
(
DatasetCollectionBinding
.
provider_name
==
embedding_model
.
model_provider
.
provider_name
,
DatasetCollectionBinding
.
model_name
==
embedding_model
.
name
).
\
order_by
(
DatasetCollectionBinding
.
created_at
).
\
first
()
if
not
dataset_collection_binding
:
dataset_collection_binding
=
DatasetCollectionBinding
(
provider_name
=
embedding_model
.
model_provider
.
provider_name
,
model_name
=
embedding_model
.
name
,
collection_name
=
"Vector_index_"
+
str
(
uuid
.
uuid4
()).
replace
(
"-"
,
"_"
)
+
'_Node'
)
db
.
session
.
add
(
dataset_collection_binding
)
db
.
session
.
commit
()
from
core.index.vector_index.qdrant_vector_index
import
QdrantVectorIndex
,
QdrantConfig
index
=
QdrantVectorIndex
(
dataset
=
dataset
,
config
=
QdrantConfig
(
endpoint
=
current_app
.
config
.
get
(
'QDRANT_URL'
),
api_key
=
current_app
.
config
.
get
(
'QDRANT_API_KEY'
),
root_path
=
current_app
.
root_path
),
embeddings
=
embeddings
)
if
index
:
index
.
restore_dataset_in_one
(
dataset
,
dataset_collection_binding
)
else
:
click
.
echo
(
'passed.'
)
original_index
=
QdrantVectorIndex
(
dataset
=
dataset
,
config
=
QdrantConfig
(
endpoint
=
current_app
.
config
.
get
(
'QDRANT_URL'
),
api_key
=
current_app
.
config
.
get
(
'QDRANT_API_KEY'
),
root_path
=
current_app
.
root_path
),
embeddings
=
embeddings
)
if
original_index
:
original_index
.
delete_original_collection
(
dataset
,
dataset_collection_binding
)
normalization_count
+=
1
else
:
click
.
echo
(
'passed.'
)
except
Exception
as
e
:
click
.
echo
(
click
.
style
(
'Create dataset index error: {} {}'
.
format
(
e
.
__class__
.
__name__
,
str
(
e
)),
fg
=
'red'
))
continue
click
.
echo
(
click
.
style
(
'Congratulations! restore {} dataset indexes.'
.
format
(
normalization_count
),
fg
=
'green'
))
from
core.index.vector_index.qdrant_vector_index
import
QdrantVectorIndex
,
QdrantConfig
index
=
QdrantVectorIndex
(
dataset
=
dataset
,
config
=
QdrantConfig
(
endpoint
=
current_app
.
config
.
get
(
'QDRANT_URL'
),
api_key
=
current_app
.
config
.
get
(
'QDRANT_API_KEY'
),
root_path
=
current_app
.
root_path
),
embeddings
=
embeddings
)
if
index
:
# index.delete_by_group_id(dataset.id)
index
.
restore_dataset_in_one
(
dataset
,
dataset_collection_binding
)
else
:
click
.
echo
(
'passed.'
)
normalization_count
.
append
(
1
)
except
Exception
as
e
:
click
.
echo
(
click
.
style
(
'Create dataset index error: {} {}'
.
format
(
e
.
__class__
.
__name__
,
str
(
e
)),
fg
=
'red'
))
@
click
.
command
(
'update_app_model_configs'
,
help
=
'Migrate data to support paragraph variable.'
)
...
...
api/core/index/vector_index/base.py
浏览文件 @
724e0537
...
...
@@ -113,8 +113,10 @@ class BaseVectorIndex(BaseIndex):
def
delete_by_group_id
(
self
,
group_id
:
str
)
->
None
:
vector_store
=
self
.
_get_vector_store
()
vector_store
=
cast
(
self
.
_get_vector_store_class
(),
vector_store
)
vector_store
.
delete
()
if
self
.
dataset
.
collection_binding_id
:
vector_store
.
delete_by_group_id
(
group_id
)
else
:
vector_store
.
delete
()
def
delete
(
self
)
->
None
:
vector_store
=
self
.
_get_vector_store
()
...
...
@@ -283,7 +285,7 @@ class BaseVectorIndex(BaseIndex):
if
documents
:
try
:
self
.
create_with_collection_name
(
documents
,
dataset_collection_binding
.
collection_name
)
self
.
add_texts
(
documents
)
except
Exception
as
e
:
raise
e
...
...
api/core/index/vector_index/qdrant.py
浏览文件 @
724e0537
...
...
@@ -1390,70 +1390,12 @@ class Qdrant(VectorStore):
path
=
path
,
**
kwargs
,
)
try
:
# Skip any validation in case of forced collection recreate.
if
force_recreate
:
raise
ValueError
# Get the vector configuration of the existing collection and vector, if it
# was specified. If the old configuration does not match the current one,
# an exception is being thrown.
collection_info
=
client
.
get_collection
(
collection_name
=
collection_name
)
current_vector_config
=
collection_info
.
config
.
params
.
vectors
if
isinstance
(
current_vector_config
,
dict
)
and
vector_name
is
not
None
:
if
vector_name
not
in
current_vector_config
:
raise
QdrantException
(
f
"Existing Qdrant collection
{
collection_name
}
does not "
f
"contain vector named
{
vector_name
}
. Did you mean one of the "
f
"existing vectors:
{
', '
.
join
(
current_vector_config
.
keys
())
}
? "
f
"If you want to recreate the collection, set `force_recreate` "
f
"parameter to `True`."
)
current_vector_config
=
current_vector_config
.
get
(
vector_name
)
# type: ignore[assignment]
elif
isinstance
(
current_vector_config
,
dict
)
and
vector_name
is
None
:
raise
QdrantException
(
f
"Existing Qdrant collection
{
collection_name
}
uses named vectors. "
f
"If you want to reuse it, please set `vector_name` to any of the "
f
"existing named vectors: "
f
"
{
', '
.
join
(
current_vector_config
.
keys
())
}
."
# noqa
f
"If you want to recreate the collection, set `force_recreate` "
f
"parameter to `True`."
)
elif
(
not
isinstance
(
current_vector_config
,
dict
)
and
vector_name
is
not
None
):
raise
QdrantException
(
f
"Existing Qdrant collection
{
collection_name
}
doesn't use named "
f
"vectors. If you want to reuse it, please set `vector_name` to "
f
"`None`. If you want to recreate the collection, set "
f
"`force_recreate` parameter to `True`."
)
# Check if the vector configuration has the same dimensionality.
if
current_vector_config
.
size
!=
vector_size
:
# type: ignore[union-attr]
raise
QdrantException
(
f
"Existing Qdrant collection is configured for vectors with "
f
"
{
current_vector_config
.
size
}
"
# type: ignore[union-attr]
f
"dimensions. Selected embeddings are
{
vector_size
}
-dimensional. "
f
"If you want to recreate the collection, set `force_recreate` "
f
"parameter to `True`."
)
current_distance_func
=
(
current_vector_config
.
distance
.
name
.
upper
()
# type: ignore[union-attr]
)
if
current_distance_func
!=
distance_func
:
raise
QdrantException
(
f
"Existing Qdrant collection is configured for "
f
"
{
current_vector_config
.
distance
}
"
# type: ignore[union-attr]
f
"similarity. Please set `distance_func` parameter to "
f
"`
{
distance_func
}
` if you want to reuse it. If you want to "
f
"recreate the collection, set `force_recreate` parameter to "
f
"`True`."
)
except
(
UnexpectedResponse
,
RpcError
,
ValueError
):
all_collection_name
=
[]
collections_response
=
client
.
get_collections
()
collection_list
=
collections_response
.
collections
for
collection
in
collection_list
:
all_collection_name
.
append
(
collection
.
name
)
if
collection_name
not
in
all_collection_name
:
vectors_config
=
rest
.
VectorParams
(
size
=
vector_size
,
distance
=
rest
.
Distance
[
distance_func
],
...
...
@@ -1481,6 +1423,67 @@ class Qdrant(VectorStore):
timeout
=
timeout
,
# type: ignore[arg-type]
)
is_new_collection
=
True
if
force_recreate
:
raise
ValueError
# Get the vector configuration of the existing collection and vector, if it
# was specified. If the old configuration does not match the current one,
# an exception is being thrown.
collection_info
=
client
.
get_collection
(
collection_name
=
collection_name
)
current_vector_config
=
collection_info
.
config
.
params
.
vectors
if
isinstance
(
current_vector_config
,
dict
)
and
vector_name
is
not
None
:
if
vector_name
not
in
current_vector_config
:
raise
QdrantException
(
f
"Existing Qdrant collection
{
collection_name
}
does not "
f
"contain vector named
{
vector_name
}
. Did you mean one of the "
f
"existing vectors:
{
', '
.
join
(
current_vector_config
.
keys
())
}
? "
f
"If you want to recreate the collection, set `force_recreate` "
f
"parameter to `True`."
)
current_vector_config
=
current_vector_config
.
get
(
vector_name
)
# type: ignore[assignment]
elif
isinstance
(
current_vector_config
,
dict
)
and
vector_name
is
None
:
raise
QdrantException
(
f
"Existing Qdrant collection
{
collection_name
}
uses named vectors. "
f
"If you want to reuse it, please set `vector_name` to any of the "
f
"existing named vectors: "
f
"
{
', '
.
join
(
current_vector_config
.
keys
())
}
."
# noqa
f
"If you want to recreate the collection, set `force_recreate` "
f
"parameter to `True`."
)
elif
(
not
isinstance
(
current_vector_config
,
dict
)
and
vector_name
is
not
None
):
raise
QdrantException
(
f
"Existing Qdrant collection
{
collection_name
}
doesn't use named "
f
"vectors. If you want to reuse it, please set `vector_name` to "
f
"`None`. If you want to recreate the collection, set "
f
"`force_recreate` parameter to `True`."
)
# Check if the vector configuration has the same dimensionality.
if
current_vector_config
.
size
!=
vector_size
:
# type: ignore[union-attr]
raise
QdrantException
(
f
"Existing Qdrant collection is configured for vectors with "
f
"
{
current_vector_config
.
size
}
"
# type: ignore[union-attr]
f
"dimensions. Selected embeddings are
{
vector_size
}
-dimensional. "
f
"If you want to recreate the collection, set `force_recreate` "
f
"parameter to `True`."
)
current_distance_func
=
(
current_vector_config
.
distance
.
name
.
upper
()
# type: ignore[union-attr]
)
if
current_distance_func
!=
distance_func
:
raise
QdrantException
(
f
"Existing Qdrant collection is configured for "
f
"
{
current_vector_config
.
distance
}
"
# type: ignore[union-attr]
f
"similarity. Please set `distance_func` parameter to "
f
"`
{
distance_func
}
` if you want to reuse it. If you want to "
f
"recreate the collection, set `force_recreate` parameter to "
f
"`True`."
)
qdrant
=
cls
(
client
=
client
,
collection_name
=
collection_name
,
...
...
api/core/index/vector_index/qdrant_vector_index.py
浏览文件 @
724e0537
...
...
@@ -169,6 +169,19 @@ class QdrantVectorIndex(BaseVectorIndex):
],
))
def
delete
(
self
)
->
None
:
vector_store
=
self
.
_get_vector_store
()
vector_store
=
cast
(
self
.
_get_vector_store_class
(),
vector_store
)
from
qdrant_client.http
import
models
vector_store
.
del_texts
(
models
.
Filter
(
must
=
[
models
.
FieldCondition
(
key
=
"group_id"
,
match
=
models
.
MatchValue
(
value
=
self
.
dataset
.
id
),
),
],
))
def
_is_origin
(
self
):
if
self
.
dataset
.
index_struct_dict
:
...
...
api/events/event_handlers/clean_when_dataset_deleted.py
浏览文件 @
724e0537
...
...
@@ -5,4 +5,5 @@ from tasks.clean_dataset_task import clean_dataset_task
@
dataset_was_deleted
.
connect
def
handle
(
sender
,
**
kwargs
):
dataset
=
sender
clean_dataset_task
.
delay
(
dataset
.
id
,
dataset
.
tenant_id
,
dataset
.
indexing_technique
,
dataset
.
index_struct
)
clean_dataset_task
.
delay
(
dataset
.
id
,
dataset
.
tenant_id
,
dataset
.
indexing_technique
,
dataset
.
index_struct
,
dataset
.
collection_binding_id
)
api/tasks/clean_dataset_task.py
浏览文件 @
724e0537
...
...
@@ -13,13 +13,15 @@ from models.dataset import DocumentSegment, Dataset, DatasetKeywordTable, Datase
@
shared_task
(
queue
=
'dataset'
)
def
clean_dataset_task
(
dataset_id
:
str
,
tenant_id
:
str
,
indexing_technique
:
str
,
index_struct
:
str
):
def
clean_dataset_task
(
dataset_id
:
str
,
tenant_id
:
str
,
indexing_technique
:
str
,
index_struct
:
str
,
collection_binding_id
:
str
):
"""
Clean dataset when dataset deleted.
:param dataset_id: dataset id
:param tenant_id: tenant id
:param indexing_technique: indexing technique
:param index_struct: index struct dict
:param collection_binding_id: collection binding id
Usage: clean_dataset_task.delay(dataset_id, tenant_id, indexing_technique, index_struct)
"""
...
...
@@ -31,9 +33,9 @@ def clean_dataset_task(dataset_id: str, tenant_id: str, indexing_technique: str,
id
=
dataset_id
,
tenant_id
=
tenant_id
,
indexing_technique
=
indexing_technique
,
index_struct
=
index_struct
index_struct
=
index_struct
,
collection_binding_id
=
collection_binding_id
)
documents
=
db
.
session
.
query
(
Document
).
filter
(
Document
.
dataset_id
==
dataset_id
).
all
()
segments
=
db
.
session
.
query
(
DocumentSegment
).
filter
(
DocumentSegment
.
dataset_id
==
dataset_id
).
all
()
...
...
@@ -43,7 +45,7 @@ def clean_dataset_task(dataset_id: str, tenant_id: str, indexing_technique: str,
if
dataset
.
indexing_technique
==
'high_quality'
:
vector_index
=
IndexBuilder
.
get_default_high_quality_index
(
dataset
)
try
:
vector_index
.
delete
(
)
vector_index
.
delete
_by_group_id
(
dataset
.
id
)
except
Exception
:
logging
.
exception
(
"Delete doc index failed when dataset deleted."
)
...
...
api/tasks/deal_dataset_vector_index_task.py
浏览文件 @
724e0537
...
...
@@ -31,8 +31,8 @@ def deal_dataset_vector_index_task(dataset_id: str, action: str):
raise
Exception
(
'Dataset not found'
)
if
action
==
"remove"
:
index
=
IndexBuilder
.
get_index
(
dataset
,
'high_quality'
,
ignore_high_quality_check
=
Fals
e
)
index
.
delete
(
)
index
=
IndexBuilder
.
get_index
(
dataset
,
'high_quality'
,
ignore_high_quality_check
=
Tru
e
)
index
.
delete
_by_group_id
(
dataset
.
id
)
elif
action
==
"add"
:
dataset_documents
=
db
.
session
.
query
(
DatasetDocument
).
filter
(
DatasetDocument
.
dataset_id
==
dataset_id
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录