Unverified commit 8795e4a0 authored by: W wuzhihua, committed by: GitHub

Merge branch 'master' into mmoe_fix_0917

......@@ -18,6 +18,7 @@ import collections
import os
import csv
import re
import io
import sys
if six.PY2:
reload(sys)
......@@ -45,11 +46,11 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
word_freq = collections.defaultdict(int)
files = os.listdir(train_dir)
for fi in files:
with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
with io.open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
word_freq = word_count(column_num, f, word_freq)
files = os.listdir(test_dir)
for fi in files:
with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
with io.open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
word_freq = word_count(column_num, f, word_freq)
word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
......@@ -65,51 +66,51 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir,
if not os.path.exists(output_train_dir):
os.mkdir(output_train_dir)
for fi in files:
with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
with open(
with io.open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
with io.open(
os.path.join(output_train_dir, fi), "w",
encoding='utf-8') as wf:
data_file = csv.reader(f)
for row in data_file:
tag_raw = re.split(r'\W+', row[0].strip())
pos_index = tag_idx.get(tag_raw[0])
wf.write(str(pos_index) + ",")
wf.write(u"{},".format(str(pos_index)))
text_raw = re.split(r'\W+', row[2].strip())
l = [text_idx.get(w) for w in text_raw]
for w in l:
wf.write(str(w) + " ")
wf.write("\n")
wf.write(u"{} ".format(str(w)))
wf.write(u"\n")
files = os.listdir(test_dir)
if not os.path.exists(output_test_dir):
os.mkdir(output_test_dir)
for fi in files:
with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
with open(
with io.open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
with io.open(
os.path.join(output_test_dir, fi), "w",
encoding='utf-8') as wf:
data_file = csv.reader(f)
for row in data_file:
tag_raw = re.split(r'\W+', row[0].strip())
pos_index = tag_idx.get(tag_raw[0])
wf.write(str(pos_index) + ",")
wf.write(u"{},".format(str(pos_index)))
text_raw = re.split(r'\W+', row[2].strip())
l = [text_idx.get(w) for w in text_raw]
for w in l:
wf.write(str(w) + " ")
wf.write("\n")
wf.write(u"{} ".format(str(w)))
wf.write(u"\n")
def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
output_vocab_text, output_vocab_tag):
print("start constuct word dict")
vocab_text = build_dict(2, 0, train_dir, test_dir)
with open(output_vocab_text, "w", encoding='utf-8') as wf:
wf.write(str(len(vocab_text)) + "\n")
with io.open(output_vocab_text, "w", encoding='utf-8') as wf:
wf.write(u"{}\n".format(str(len(vocab_text))))
vocab_tag = build_dict(0, 0, train_dir, test_dir)
with open(output_vocab_tag, "w", encoding='utf-8') as wf:
wf.write(str(len(vocab_tag)) + "\n")
with io.open(output_vocab_tag, "w", encoding='utf-8') as wf:
wf.write(u"{}\n".format(str(len(vocab_tag))))
print("construct word dict done\n")
write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir,
......
......@@ -29,11 +29,12 @@ dataset:
hyper_parameters:
optimizer:
class: sgd
class: adam
learning_rate: 0.001
strategy: async
trigram_d: 1439
strategy: sync
trigram_d: 2900
neg_num: 1
slice_end: 8
fc_sizes: [300, 300, 128]
fc_acts: ['tanh', 'tanh', 'tanh']
......@@ -44,7 +45,7 @@ runner:
- name: train_runner
class: train
# num of epochs
epochs: 3
epochs: 1
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
......@@ -54,14 +55,14 @@ runner:
save_inference_feed_varnames: ["query", "doc_pos"] # feed vars of save inference
save_inference_fetch_varnames: ["cos_sim_0.tmp_0"] # fetch vars of save inference
init_model_path: "" # load model path
print_interval: 2
print_interval: 10
phases: phase1
- name: infer_runner
class: infer
# device to run training or infer
device: cpu
print_interval: 1
init_model_path: "increment/2" # load model path
init_model_path: "increment/0" # load model path
phases: phase2
# runner will run all the phase in each epoch
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/bash
wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
tar xzf dssm%2Fbq.tar.gz
rm -f dssm%2Fbq.tar.gz
mv bq/train.txt ./raw_data.txt
python3 preprocess.py
mkdir big_train
mv train.txt ./big_train
mkdir big_test
mv test.txt ./big_test
#encoding=utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -11,14 +12,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#encoding=utf-8
import os
import sys
import jieba
import numpy as np
import random
f = open("./zhidao", "r")
f = open("./raw_data.txt", "r")
lines = f.readlines()
f.close()
......@@ -26,14 +27,15 @@ f.close()
word_dict = {}
for line in lines:
line = line.strip().split("\t")
text = line[0].split(" ") + line[1].split(" ")
text = line[0].strip("") + " " + line[1].strip("")
text = jieba.cut(text)
for word in text:
if word in word_dict:
continue
else:
word_dict[word] = len(word_dict) + 1
f = open("./zhidao", "r")
f = open("./raw_data.txt", "r")
lines = f.readlines()
f.close()
......@@ -57,12 +59,13 @@ for line in lines:
else:
pos_dict[line[0]] = [line[1]]
print("build dict done")
# split into train and test sets
query_list = list(pos_dict.keys())
#print(len(query))
random.shuffle(query_list)
train_query = query_list[:90]
test_query = query_list[90:]
#print(len(query_list))
#random.shuffle(query_list)
train_query = query_list[:11600]
test_query = query_list[11600:]
# build the train set
train_set = []
......@@ -73,6 +76,7 @@ for query in train_query:
for neg in neg_dict[query]:
train_set.append([query, pos, neg])
random.shuffle(train_set)
print("get train_set done")
# build the test set
test_set = []
......@@ -84,13 +88,14 @@ for query in test_query:
for neg in neg_dict[query]:
test_set.append([query, neg, 0])
random.shuffle(test_set)
print("get test_set done")
# convert query, pos and neg in the train set to bag-of-words
f = open("train.txt", "w")
for line in train_set:
query = line[0].strip().split(" ")
pos = line[1].strip().split(" ")
neg = line[2].strip().split(" ")
query = jieba.cut(line[0].strip())
pos = jieba.cut(line[1].strip())
neg = jieba.cut(line[2].strip())
query_token = [0] * (len(word_dict) + 1)
for word in query:
query_token[word_dict[word]] = 1
......@@ -109,8 +114,8 @@ f.close()
f = open("test.txt", "w")
fa = open("label.txt", "w")
for line in test_set:
query = line[0].strip().split(" ")
pos = line[1].strip().split(" ")
query = jieba.cut(line[0].strip())
pos = jieba.cut(line[1].strip())
label = line[2]
query_token = [0] * (len(word_dict) + 1)
for word in query:
......
......@@ -29,6 +29,7 @@ class Model(ModelBase):
self.hidden_acts = envs.get_global_env("hyper_parameters.fc_acts")
self.learning_rate = envs.get_global_env(
"hyper_parameters.learning_rate")
self.slice_end = envs.get_global_env("hyper_parameters.slice_end")
def input_data(self, is_infer=False, **kwargs):
query = fluid.data(
......@@ -94,7 +95,7 @@ class Model(ModelBase):
prob = fluid.layers.softmax(concat_Rs, axis=1)
hit_prob = fluid.layers.slice(
prob, axes=[0, 1], starts=[0, 0], ends=[8, 1])
prob, axes=[0, 1], starts=[0, 0], ends=[self.slice_end, 1])
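# Note (ours): slice_end is expected to match the training batch size; the README
# below asks you to update it whenever batch_size changes.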
loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
avg_cost = fluid.layers.mean(x=loss)
self._cost = avg_cost
......
......@@ -4,11 +4,12 @@
```
├── data #sample data
├── train
├── train.txt #training data sample
├── test
├── test.txt #test data sample
├── preprocess.py #data preprocessing script
├── train
├── train.txt #training data sample
├── test
├── test.txt #test data sample
├── preprocess.py #data preprocessing script
├── data_process #one-click data processing script
├── __init__.py
├── README.md #documentation
├── model.py #model file
......@@ -46,13 +47,19 @@ Query 和 Doc 的语义相似性可以用这两个向量的 cosine 距离表示
<p>
## Data preparation
We have released our self-built test sets, covering four datasets: Baidu Zhidao, ECOM, QQSIM and UNICOM. Here we use the Baidu Zhidao dataset for training. Run the following commands to download the dataset.
BQ is a Chinese question-matching dataset for intelligent customer service. It is a corpus for automatic question answering, containing 120,000 sentence pairs annotated with similarity labels. The data contains typos and non-standard grammar, which makes it closer to real industrial scenarios. Run the following commands to download the dataset.
```
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm simnet_dataset-1.0.0.tar.gz
wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
tar xzf dssm%2Fbq.tar.gz
rm -f dssm%2Fbq.tar.gz
```
Dataset sample:
```
请问一天是否都是限定只能转入或转出都是五万。 微众多少可以赎回短期理财 0
微粒咨询电话号码多少 你们的人工客服电话是多少 1
已经在银行换了新预留号码。 我现在换了电话号码,这个需要更换吗 1
Fields are separated by tabs. The first two columns are the two texts; the third column is the label (0 or 1, where 0 means the two texts are not similar and 1 means they are similar).
```
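For reference, a minimal sketch (not part of this repo; the names are ours) of parsing one line of this format:
```python
# Parse one tab-separated BQ line: "text_a<TAB>text_b<TAB>label"
def parse_bq_line(line):
    text_a, text_b, label = line.rstrip("\n").split("\t")
    return text_a, text_b, int(label)
```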
## Runtime environment
PaddlePaddle>=1.7.2
......@@ -120,21 +127,24 @@ PaddleRec Finish
2. Download and extract the dataset in the data directory with the following commands:
```
cd data
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm simnet_dataset-1.0.0.tar.gz
wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
tar xzf dssm%2Fbq.tar.gz
rm -f dssm%2Fbq.tar.gz
```
3. We provide a script that quickly converts the Chinese text in the dataset into a trainable format. After extracting the dataset, you will see a file named zhidao in the directory. Run the provided preprocess.py under a python3 environment to generate test.txt, train.txt and label.txt, which can be used directly for training. Move them into the train and test directories for use during training. The commands are as follows:
3. We provide a script that quickly converts the Chinese text in the dataset into a trainable format. After extracting the dataset, you will see a directory named bq. Move its train.txt into the data directory, then run the provided preprocess.py under a python3 environment to generate test.txt, train.txt and label.txt, which can be used directly for training. Move them into the train and test directories for use during training. Generation takes a while, so please be patient. The commands are as follows:
```
mv data/zhidao ./
rm -rf data
mv bq/train.txt ./raw_data.txt
python3 preprocess.py
rm -f ./train/train.txt
mv train.txt ./train
rm -f ./test/test.txt
mv test.txt test
mkdir big_train
mv train.txt ./big_train
mkdir big_test
mv test.txt ./big_test
cd ..
```
Alternatively, you can use our one-click data processing script data_process.sh:
```
sh data_process.sh
```
Format after preprocessing:
The training set consists of three sparse BOW vectors: query, pos and neg.
The test set consists of two sparse BOW vectors: query and pos (see the sketch below).
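For reference, a minimal sketch of how preprocess.py above turns a sentence into such a BOW vector (the unknown-word guard is our addition; the script itself builds word_dict from the same data, so every word is known):
```python
import jieba

def to_bow(sentence, word_dict):
    # one slot per vocabulary id; position word_dict[word] is set to 1
    vec = [0] * (len(word_dict) + 1)
    for word in jieba.cut(sentence.strip()):
        if word in word_dict:
            vec[word_dict[word]] = 1
    return vec
```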
......@@ -144,8 +154,10 @@ label.txt中对应的测试集中的标签
Change workspace to your current absolute path (you can get it with the pwd command).
Change batch_size in dataset_train from 8 to 128.
In model.py, change hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[8, 1])
to hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[128, 1]). Whenever you change the batch size, the first value in ends must change accordingly.
Change slice_end in hyper_parameters from 8 to 128. Whenever you change the batch size, this parameter must change accordingly.
Change data_path in dataset_train to {workspace}/data/big_train.
Change data_path in dataset_infer to {workspace}/data/big_test.
Change trigram_d in hyper_parameters to 5913 (a sketch of the resulting fragment follows below).
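Putting these edits together, the touched parts of config.yaml would look roughly like this (a sketch; only the keys mentioned above are shown, other keys keep their existing values):
```yaml
dataset:
  - name: dataset_train
    batch_size: 128
    data_path: "{workspace}/data/big_train"
  - name: dataset_infer
    data_path: "{workspace}/data/big_test"
hyper_parameters:
  trigram_d: 5913
  slice_end: 128
```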
5. Run the script to start training. The script runs python -m paddlerec.run -m ./config.yaml to launch training and writes the results to a result file. It then runs transform.py to reorganize the data, and finally computes the positive/negative order metrics:
```
......@@ -155,26 +167,14 @@ sh run.sh
Sample output:
```
................run.................
!!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list.
CPU_NUM indicates that how many CPUPlace are used in the current task.
And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
export CPU_NUM=32 # for example, set CPU_NUM as number of physical CPU core which is 32.
!!! The default number of CPU_NUM=1.
I0821 07:16:04.512531 32200 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 1 cards are used, so 1 programs are executed in parallel.
I0821 07:16:04.515708 32200 build_strategy.cc:365] SeqOnlyAllReduceOps:0, num_trainers:1
I0821 07:16:04.518872 32200 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True
I0821 07:16:04.520995 32200 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
75
pnr: 2.25581395349
query_num: 11
pair_num: 184 184
equal_num: 44
正序率: 0.692857142857
97 43
```
6. Note: because a small dataset is used for training and testing, the metrics fluctuate considerably. If the metrics do not meet expectations, rerun step 5 several times to obtain reasonable values.
8989
pnr:2.75621659307
query_num:1369
pair_num:16240 , 16240
equal_num:77
正序率: 0.733774670544
pos_num: 11860 , neg_num: 4303
```
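For reference, pnr and the positive ratio (正序率) printed above follow directly from the pairwise counts computed by the evaluation script at the end of this commit; a quick check of the reported numbers, as a sketch:
```python
# Recompute the reported metrics from the printed counts.
pos_num, neg_num, equal_num = 11860, 4303, 77

pnr = 1.0 * pos_num / neg_num                          # ~2.7562
positive_ratio = 1.0 * pos_num / (pos_num + neg_num)   # ~0.7338
pair_num = pos_num + neg_num + equal_num               # 16240

print(pnr, positive_ratio, pair_num)
```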
## Advanced usage
......
......@@ -13,7 +13,7 @@
# limitations under the License.
#!/bin/bash
echo "................run................."
python -m paddlerec.run -m ./config.yaml >result1.txt
python -m paddlerec.run -m ./config.yaml &> result1.txt
grep -i "query_doc_sim" ./result1.txt >./result2.txt
sed '$d' result2.txt >result.txt
rm -f result1.txt
......
......@@ -32,13 +32,13 @@ filename = './result.txt'
sim = []
for line in open(filename):
line = line.strip().split(",")
line[1] = line[1].split(":")
line = line[1][1].strip(" ")
line[3] = line[3].split(":")
line = line[3][1].strip(" ")
line = line.strip("[")
line = line.strip("]")
sim.append(float(line))
filename = './data/test/test.txt'
filename = './data/big_test/test.txt'
f = open(filename, "r")
f.readline()
query = []
......
......@@ -106,7 +106,7 @@ def make_train():
pair_list.append((d1, high_d2, low_d2))
print('Pair Instance Count:', len(pair_list))
f = open("./data/train/train.txt", "w")
f = open("./data/big_train/train.txt", "w")
for batch in range(800):
X1 = np.zeros((batch_size * 2, data1_maxlen), dtype=np.int32)
X2 = np.zeros((batch_size * 2, data2_maxlen), dtype=np.int32)
......@@ -131,7 +131,7 @@ def make_train():
def make_test():
rel = read_relation(filename=os.path.join(Letor07Path,
'relation.test.fold1.txt'))
f = open("./data/test/test.txt", "w")
f = open("./data/big_test/test.txt", "w")
for label, d1, d2 in rel:
X1 = np.zeros(data1_maxlen, dtype=np.int32)
X2 = np.zeros(data2_maxlen, dtype=np.int32)
......
......@@ -3,7 +3,9 @@
echo "...........load data................."
wget --no-check-certificate 'https://paddlerec.bj.bcebos.com/match_pyramid/match_pyramid_data.tar.gz'
mv ./match_pyramid_data.tar.gz ./data
rm -rf ./data/relation.test.fold1.txt ./data/realtion.train.fold1.txt
rm -rf ./data/relation.test.fold1.txt
tar -xvf ./data/match_pyramid_data.tar.gz
mkdir ./data/big_train
mkdir ./data/big_test
echo "...........data process..............."
python ./data/process.py
......@@ -49,8 +49,8 @@ filename = './result.txt'
pred = []
for line in open(filename):
line = line.strip().split(",")
line[1] = line[1].split(":")
line = line[1][1].strip(" ")
line[3] = line[3].split(":")
line = line[3][1].strip(" ")
line = line.strip("[")
line = line.strip("]")
pred.append(float(line))
......
......@@ -56,10 +56,10 @@
4. Embedding file: we store the pre-trained word vectors in an embedding file, e.g. embed_wiki-pdc_d50_norm
## Runtime environment
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## Quick start
......@@ -72,7 +72,7 @@ python -m paddlerec.run -m models/match/match-pyramid/config.yaml
## Reproducing the paper
1. Make sure your current directory is PaddleRec/models/match/match-pyramid
2. We provide a preprocessing script that downloads the original dataset and generates the training and test data in one step; simply run: bash data_process.sh
Running this script downloads the Letor07 dataset from a domestic mirror, deletes the existing relation.test.fold1.txt and relation.train.fold1.txt in the data folder, and extracts the full dataset into the data folder. It then runs process.py to place the full training data in `./data/train` and the full test data in `./data/test`, and generates the embedding.npy file used to initialize the embedding layer.
Running this script downloads the Letor07 dataset from a domestic mirror and extracts the full dataset into the data folder. It then runs process.py to place the full training data in `./data/big_train` and the full test data in `./data/big_test`, and generates the embedding.npy file used to initialize the embedding layer.
The expected output of the script is:
```
bash data_process.sh
......@@ -123,6 +123,8 @@ data/embed_wiki-pdc_d50_norm
3. Open config.yaml and modify the parameters:
Change workspace to your current absolute path (you can get it with the pwd command).
Change data_path under dataset_train to {workspace}/data/big_train.
Change data_path under dataset_infer to {workspace}/data/big_test.
4. Then simply run: bash run.sh to reproduce the paper's results.
This script runs python -m paddlerec.run -m ./config.yaml to train and test the model, saves the test results to result.txt, and finally runs eval.py to compute the map metric on the data.
......@@ -131,7 +133,7 @@ data/embed_wiki-pdc_d50_norm
..............test.................
13651
336
('map=', 0.420878322843591)
('map=', 0.3993127885738651)
```
## Advanced usage
......
#!/bin/bash
echo "................run................."
python -m paddlerec.run -m ./config.yaml >result1.txt
grep -i "prediction" ./result1.txt >./result.txt
python -m paddlerec.run -m ./config.yaml &>result1.txt
grep -i "prediction" ./result1.txt >./result2.txt
sed '$d' result2.txt >result.txt
rm -f result2.txt
rm -f result1.txt
python eval.py
......@@ -26,19 +26,19 @@ dataset:
batch_size: 1
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
sparse_slots: "1 2"
sparse_slots: "0 1"
# hyper parameters of user-defined network
hyper_parameters:
optimizer:
class: Adam
learning_rate: 0.0001
strategy: async
learning_rate: 0.001
strategy: sync
query_encoder: "gru"
title_encoder: "gru"
query_encode_dim: 128
title_encode_dim: 128
sparse_feature_dim: 1439
sparse_feature_dim: 6327
embedding_dim: 128
hidden_size: 128
margin: 0.1
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/bash
wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
tar xzf dssm%2Fbq.tar.gz
rm -f dssm%2Fbq.tar.gz
mv bq/train.txt ./raw_data.txt
python3 preprocess.py
mkdir big_train
mv train.txt ./big_train
mkdir big_test
mv test.txt ./big_test
#encoding=utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -11,14 +12,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#encoding=utf-8
import os
import sys
import jieba
import numpy as np
import random
f = open("./zhidao", "r")
f = open("./raw_data.txt", "r")
lines = f.readlines()
f.close()
......@@ -26,14 +27,15 @@ f.close()
word_dict = {}
for line in lines:
line = line.strip().split("\t")
text = line[0].split(" ") + line[1].split(" ")
text = line[0].strip("") + line[1].strip("")
text = jieba.cut(text)
for word in text:
if word in word_dict:
continue
else:
word_dict[word] = len(word_dict) + 1
f = open("./zhidao", "r")
f = open("./raw_data.txt", "r")
lines = f.readlines()
f.close()
......@@ -59,10 +61,10 @@ for line in lines:
# split into train and test sets
query_list = list(pos_dict.keys())
#print(len(query_list))
print(len(query_list))
random.shuffle(query_list)
train_query = query_list[:90]
test_query = query_list[90:]
train_query = query_list[:11600]
test_query = query_list[11600:]
# build the train set
train_set = []
......@@ -88,9 +90,9 @@ random.shuffle(test_set)
# convert query, pos and neg in the train set into the output format
f = open("train.txt", "w")
for line in train_set:
query = line[0].strip().split(" ")
pos = line[1].strip().split(" ")
neg = line[2].strip().split(" ")
query = jieba.cut(line[0].strip())
pos = jieba.cut(line[1].strip())
neg = jieba.cut(line[2].strip())
query_list = []
for word in query:
query_list.append(word_dict[word])
......@@ -110,8 +112,8 @@ f = open("test.txt", "w")
fa = open("label.txt", "w")
fb = open("testquery.txt", "w")
for line in test_set:
query = line[0].strip().split(" ")
pos = line[1].strip().split(" ")
query = jieba.cut(line[0].strip())
pos = jieba.cut(line[1].strip())
label = line[2]
query_list = []
for word in query:
......
......@@ -4,11 +4,12 @@
```
├── data #sample data
├── train
├── train.txt #training data sample
├── test
├── test.txt #test data sample
├── preprocess.py #data preprocessing script
├── train
├── train.txt #training data sample
├── test
├── test.txt #test data sample
├── preprocess.py #data preprocessing script
├── data_process.sh #one-click data processing script
├── __init__.py
├── README.md #documentation
├── model.py #model file
......@@ -42,14 +43,20 @@
<p>
## Data preparation
We have released our self-built test sets, covering four datasets: Baidu Zhidao, ECOM, QQSIM and UNICOM. Here we use the Baidu Zhidao dataset for training. Run the following commands to download the dataset.
BQ is a Chinese question-matching dataset for intelligent customer service. It is a corpus for automatic question answering, containing 120,000 sentence pairs annotated with similarity labels. The data contains typos and non-standard grammar, which makes it closer to real industrial scenarios. Run the following commands to download the dataset.
```
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm simnet_dataset-1.0.0.tar.gz
wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
tar xzf dssm%2Fbq.tar.gz
rm -f dssm%2Fbq.tar.gz
```
The data format is a slot identifying each sentence, followed by the tokens of the words in that sentence; together they mark a word in the form {slot:token}:
Dataset sample:
```
请问一天是否都是限定只能转入或转出都是五万。 微众多少可以赎回短期理财 0
微粒咨询电话号码多少 你们的人工客服电话是多少 1
已经在银行换了新预留号码。 我现在换了电话号码,这个需要更换吗 1
Fields are separated by tabs. The first two columns are the two texts; the third column is the label (0 or 1, where 0 means the two texts are not similar and 1 means they are similar).
```
The final output format is a slot identifying each sentence, followed by the tokens of the words in that sentence; together they mark a word in the form {slot:token}:
```
0:358 0:206 0:205 0:250 0:9 0:3 0:207 0:10 0:330 0:164 1:1144 1:217 1:206 1:9 1:3 1:207 1:10 1:398 1:2 2:217 2:206 2:9 2:3 2:207 2:10 2:398 2:2
0:358 0:206 0:205 0:250 0:9 0:3 0:207 0:10 0:330 0:164 1:951 1:952 1:206 1:9 1:3 1:207 1:10 1:398 2:217 2:206 2:9 2:3 2:207 2:10 2:398 2:2
......@@ -75,24 +82,29 @@ python -m paddlerec.run -m models/match/multiview-simnet/config.yaml
2. Download and extract the dataset in the data directory with the following commands:
```
cd data
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm -f simnet_dataset-1.0.0.tar.gz
mv data/zhidao ./
rm -rf data
wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
tar xzf dssm%2Fbq.tar.gz
rm -f dssm%2Fbq.tar.gz
mv bq/train.txt ./raw_data.txt
```
3. We provide a script that quickly converts the Chinese text in the dataset into a trainable format. After extracting the dataset, you will see a file named zhidao in the directory. Then run the provided preprocess.py under a python3 environment to generate test.txt, train.txt, label.txt and testquery.txt, which can be used directly for training. Move them into the train and test directories for use during training. The commands are as follows:
3. We provide a script that quickly converts the Chinese text in the dataset into a trainable format. After extracting the dataset, you will see a directory named bq. Move its train.txt into the data directory, then run the provided preprocess.py under a python3 environment to generate test.txt, train.txt, label.txt and testquery.txt, which can be used directly for training. Move them into the train and test directories for use during training. Generation takes a while, so please be patient. The commands are as follows:
```
python3 preprocess.py
rm -f ./train/train.txt
mv train.txt ./train
rm -f ./test/test.txt
mv test.txt ./test
mkdir big_train
mv train.txt ./big_train
mkdir big_test
mv test.txt ./big_test
cd ..
```
4. Go back to the tagspace directory, open config.yaml and modify the parameters
Alternatively, you can use our one-click data processing script data_process.sh:
```
sh data_process.sh
```
4. Go back to the multiview-simnet directory, open config.yaml and modify the parameters:
Change workspace to your current absolute path (you can get it with the pwd command).
Change workspace to your current absolute path (you can get it with the pwd command).
Change data_path in dataset_train to {workspace}/data/big_train.
Change data_path in dataset_infer to {workspace}/data/big_test.
5. Run the script to start training. The script runs python -m paddlerec.run -m ./config.yaml to launch training and writes the results to a result file. It then runs the formatting program transform, and finally computes the positive/negative order ratio:
```
......@@ -102,26 +114,14 @@ sh run.sh
The output looks roughly like this:
```
................run.................
!!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list.
CPU_NUM indicates that how many CPUPlace are used in the current task.
And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
export CPU_NUM=32 # for example, set CPU_NUM as number of physical CPU core which is 32.
!!! The default number of CPU_NUM=1.
I0821 14:24:57.255358 7888 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 1 cards are used, so 1 programs are executed in parallel.
I0821 14:24:57.259166 7888 build_strategy.cc:365] SeqOnlyAllReduceOps:0, num_trainers:1
I0821 14:24:57.262634 7888 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True
I0821 14:24:57.264791 7888 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
103
pnr: 1.17674418605
query_num: 11
pair_num: 468 468
8902
pnr: 13.6785350966
query_num: 1371
pair_num: 14429 14429
equal_num: 0
正序率: 0.540598290598
253 215
正序率: 0.931873310694
13446 983
```
6. Note: because a small dataset is used for training and testing, the metrics fluctuate considerably. If the metrics do not meet expectations, rerun step 5 several times to obtain reasonable values.
## Advanced usage
## FAQ
......@@ -14,7 +14,7 @@
#!/bin/bash
echo "................run................."
python -m paddlerec.run -m ./config.yaml >result1.txt
python -m paddlerec.run -m ./config.yaml &>result1.txt
grep -i "query_pt_sim" ./result1.txt >./result2.txt
sed '$d' result2.txt >result.txt
rm -f result1.txt
......
......@@ -31,8 +31,9 @@ filename = './result.txt'
sim = []
for line in open(filename):
line = line.strip().split(",")
line[1] = line[1].split(":")
line = line[1][1].strip(" ")
print(line)
line[3] = line[3].split(":")
line = line[3][1].strip(" ")
line = line.strip("[")
line = line.strip("]")
sim.append(float(line))
......@@ -49,5 +50,6 @@ f.close()
filename = 'pair.txt'
f = open(filename, "w")
for i in range(len(sim)):
print(i)
f.write(str(query[i]) + "\t" + str(sim[i]) + "\t" + str(label[i]) + "\n")
f.close()
......@@ -13,6 +13,7 @@ cd paddle-rec
python -m paddlerec.run -m models/treebased/tdm/config.yaml
```
3. For details on tree building and custom training, see [TDM-Demo tree building and training](./gen_tree/README.md)
## Preparing the tree structure
### Terminology
......
wget https://paddlerec.bj.bcebos.com/utils/tree_build_utils.tar.gz --no-check-certificate
# input_path: path of the embedding file
# emb_shape: dimension of the value in each embedding key-value pair
# required emb format: embedding_id(int64),embedding(float),embedding(float),......,embedding(float)
# cluster_threads: number of threads used for clustering when building the tree
python_172_anytree/bin/python -u main.py --input_path=./gen_emb/item_emb.txt --output_path=./ --emb_shape=24 --cluster_threads=4
The tree building flow is: 1. read the emb -> 2. k-means clustering -> 3. organize the clustering result into a tree -> 4. derive from the tree structure the 4 files required by the model
1 Layer_list: records which nodes are on each layer. Used for training.
2 Travel_list: records the travel path of each leaf node. Used for training.
3 Tree_Info: records the information of each node, mainly: whether it is an item / its item_id, its layer, its parent and its children. Used for retrieval.
4 Tree_Embedding: records the embedding of every node. Used for both training and retrieval (see the loading sketch after this section).
Pay attention to whether the items in the training data are the original item ids used before tree building, the tree-based node ids, or the leaf-based leaf ids; in tdm_reader.py you can load a dictionary to do the mapping.
The output folder produced by the internal tree-building tool contains a mapping file named id2nodeid.txt, in the format [hash value] + [tree node ID] + [leaf node ID (the index of the leaf node, the input required by the tdm_sampler op)].
Another file, id2bidword.txt, also holds a mapping, in the format [hash value] + [original item ID]; this file stores only leaf-node information.
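As a quick sanity check, a minimal sketch (ours; it assumes the output file names produced by the gen_tree tooling below) that loads these files and prints their shapes:
```python
import numpy as np

travel_list = np.load("output/travel_list.npy")  # one row per item: node ids from root to leaf
tree_info = np.load("output/tree_info.npy")      # one row per node: item_id, layer, parent, children
tree_emb = np.load("output/tree_emb.npy")        # one row per node: its embedding

with open("output/layer_list.txt") as f:
    layer_list = [line.strip().split(",") for line in f if line.strip()]

print("items:", travel_list.shape[0], "nodes:", tree_info.shape[0])
print("nodes per layer:", [len(layer) for layer in layer_list])
```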
......@@ -59,49 +59,39 @@ hyper_parameters:
tree_emb_path: "{workspace}/tree/tree_emb.npy"
# select runner by name
mode: runner1
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
mode: [runner1]
runner:
- name: runner1
class: train
startup_class_path: "{workspace}/tdm_startup.py"
# num of epochs
epochs: 10
# device to run training or infer
device: cpu
save_checkpoint_interval: 2 # save model interval of epochs
save_inference_interval: 4 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
print_interval: 10
phases: [phase1]
- name: runner2
class: infer
startup_class_path: "{workspace}/tdm_startup.py"
# device to run training or infer
device: cpu
init_model_path: "increment/0" # load model path
print_interval: 1
phases: [phase2]
- name: runner3
class: local_cluster_train
startup_class_path: "{workspace}/tdm_startup.py"
fleet_mode: ps
epochs: 10
# device to run training or infer
device: cpu
save_checkpoint_interval: 2 # save model interval of epochs
save_inference_interval: 4 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "init_model" # load model path
print_interval: 10
phases: [phase1]
# runner will run all the phase in each epoch
phase:
......@@ -109,7 +99,7 @@ phase:
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
# - name: phase2
# model: "{workspace}/model.py"
# dataset_name: dataset_infer
# thread_num: 2
- name: phase2
model: "{workspace}/model.py"
dataset_name: dataset_infer
thread_num: 2
# TDM-Demo tree building and training
## Environment required for tree building
Requirements:
- python >= 2.7
- paddlepaddle >= 1.7.2 (1.7.2 recommended)
- paddle-rec (clone the paddlerec GitHub repo and run python setup.py install)
- sklearn
- anytree
## Tree building flow
### Generating the embeddings needed for tree building
- Generate fake embeddings
```shell
cd gen_tree
python -u emb_util.py
```
The generated embeddings have shape [13, 64], meaning there are 13 items and each item embedding has dimension 64. The generated item_emb is at `gen_tree/item_emb.txt`.
The format is `emb_value_0(float) <space> emb_value_1(float) ... emb_value_63(float) \t item_id` (see the parsing sketch below).
In the demo, item ids must start from 0 and lie in the range [0, item_nums-1].
In real scenarios this can be satisfied with various hash mappings.
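A minimal sketch (ours) of reading one line in this format, mirroring the parsing done in cluster.py further below:
```python
def parse_item_emb_line(line):
    # "<emb_value_0> <emb_value_1> ... <emb_value_63>\t<item_id>"
    emb_str, item_id = line.rstrip("\n").split("\t")
    return int(item_id), [float(v) for v in emb_str.split()]
```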
### Clustering the item embeddings to build the tree
Run
```shell
cd gen_tree
# emd_path: path of item_emb
# emb_size: the second dimension of item_emb, i.e. the embedding size of each item (64 in this example)
# threads: number of threads for multi-threaded tree building
# n_clusters: branching factor of the final tree; set to 2 here (binary tree)
python gen_tree.py --emd_path item_emb.txt --emb_size 64 --output_dir ./output --threads 1 --n_clusters 2
```
The tree structure files needed for training are generated under `gen_tree/output`:
```shell
.
├── id2item.json # mapping from tree node id to item id
├── layer_list.txt # which nodes are on each layer of the tree
├── travel_list.npy # root-to-leaf traversal path for each item, ordered by item
├── travel_list.txt # plain-text version of the file above
├── tree_embedding.txt # embeddings of all nodes, ordered by node id
├── tree_emb.npy # .npy version of the file above
├── tree_info.npy # per node: whether it maps to an item / parent / layer / children, ordered by node
├── tree_info.txt # plain-text version of the file above
└── tree.pkl # tree structure obtained from clustering
```
In the end we need the following four files generated by tree building for network training; see `models/treebased/tdm/config.yaml`:
1. layer_list.txt
2. travel_list.npy
3. tree_info.npy
4. tree_emb.npy
### Running training
- Modify the configuration in `config.yaml`
First change
```yaml
hyper_parameters:
# ...
tree:
# for single-machine training, load the tree only once, save it as a paddle tensor, and warm-start from the paddle model afterwards
# for distributed training, each trainer needs to load it independently
# at inference time, also load from the paddle model
load_tree_from_numpy: True # only once
load_paddle_model: False # train & infer need, after load from npy, change it to True
tree_layer_path: "{workspace}/tree/layer_list.txt"
tree_travel_path: "{workspace}/tree/travel_list.npy"
tree_info_path: "{workspace}/tree/tree_info.npy"
tree_emb_path: "{workspace}/tree/tree_emb.npy"
```
Change the paths above to the locations of the files produced by tree building.
Then change
```yaml
hyper_parameters:
max_layers: 4 # number of tree layers, excluding the root
node_nums: 26 # total number of nodes in the tree, equal to the number of rows in tree_info
leaf_node_nums: 13 # total number of leaf nodes in the tree
layer_node_num_list: [2, 4, 8, 10] # number of nodes on each layer
child_nums: 2 # maximum number of children per node (branching factor)
neg_sampling_list: [1, 2, 3, 4] # number of negative samples on each tree layer, a training hyper-parameter
```
If you do not know the exact values of the parameters above, do a trial run; after paddlerec reads the generated tree files it prints the details to the screen, as shown below:
```shell
...
File_list: ['models/treebased/tdm/data/train/demo_fake_input.txt']
2020-09-10 15:17:19,259 - INFO - Run TDM Trainer Startup Pass
2020-09-10 15:17:19,283 - INFO - load tree from numpy
2020-09-10 15:17:19,284 - INFO - TDM Tree leaf node nums: 13
2020-09-10 15:17:19,284 - INFO - TDM Tree max layer: 4
2020-09-10 15:17:19,284 - INFO - TDM Tree layer_node_num_list: [2, 4, 8, 10]
2020-09-10 15:17:19,285 - INFO - Begin Save Init model.
2020-09-10 15:17:19,394 - INFO - End Save Init model.
Running SingleRunner.
...
```
Simply copy these values into the configuration (or derive them with the sketch below).
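A minimal sketch (ours, assuming the file formats described above) that reads these values directly from the gen_tree output:
```python
import numpy as np

tree_info = np.load("output/tree_info.npy")
travel_list = np.load("output/travel_list.npy")
with open("output/layer_list.txt") as f:
    layers = [line.strip().split(",") for line in f if line.strip()]

print("node_nums:", tree_info.shape[0])           # rows of tree_info
print("leaf_node_nums:", travel_list.shape[0])    # one root-to-leaf path per item
print("max_layers:", len(layers))                 # the root layer is not listed
print("layer_node_num_list:", [len(l) for l in layers])
```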
- Training
Run
```
cd /PaddleRec # root directory of the cloned PaddleRec repo
python -m paddlerec.run -m models/treebased/tdm/config.yaml
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import cluster
__all__ = []
__all__ += cluster.__all__
# Copyright (C) 2016-2018 Alibaba Group Holding Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import codecs
import os
import time
import collections
import argparse
import multiprocessing as mp
import numpy as np
from sklearn.cluster import KMeans
import tree_builder
__all__ = ['Cluster']
class Cluster:
def __init__(self,
filename,
emb_size,
id_offset=None,
parall=16,
prev_result=None,
output_dir='./',
_n_clusters=2):
self.filename = filename
self.emb_size = emb_size
self.mini_batch = 256
self.ids = None
self.data = None
self.items = None
self.parall = parall
self.queue = None
self.timeout = 5
self.id_offset = id_offset
self.codes = None
self.prev_result = prev_result
self.output_dir = output_dir
self.n_clusters = _n_clusters
def _read(self):
t1 = time.time()
ids = list()
data = list()
items = list()
count = 0
with codecs.open(self.filename, 'r', encoding='utf-8') as f:
for line in f:
arr = line.rstrip().split('\t')
if not arr:
break
elif len(arr) == 1:
label = arr[0]
emb_vec = (np.random.random_sample(
(self.emb_size, ))).tolist()
elif len(arr) == 2:
label = arr[1]
emb_vec = arr[0].split()
if len(emb_vec) != self.emb_size:
continue
if label in items:
index = items.index(label)
for i in range(0, len(emb_vec)):
data[index][i + 1] += float(emb_vec[i])
data[index][0] += 1
else:
items.append(label)
ids.append(count)
count += 1
vector = list()
vector.append(1)
for i in range(0, len(emb_vec)):
vector.append(float(emb_vec[i]))
data.append(vector)
for i in range(len(data)):
data_len = len(data[0])
for j in range(1, data_len):
data[i][j] /= data[i][0]
data[i] = data[i][1:]
self.ids = np.array(ids)
self.data = np.array(data)
self.items = np.array(items)
t2 = time.time()
print("Read data done, {} records read, elapsed: {}".format(
len(ids), t2 - t1))
def train(self):
''' Cluster data '''
self._read()
queue = mp.Queue()
self.process_prev_result(queue)
processes = []
pipes = []
for _ in range(self.parall):
a, b = mp.Pipe()
p = mp.Process(target=self._train, args=(b, queue))
processes.append(p)
pipes.append(a)
p.start()
self.codes = np.zeros((len(self.ids), ), dtype=np.int64)
for pipe in pipes:
codes = pipe.recv()
for i in range(len(codes)):
if codes[i] > 0:
self.codes[i] = codes[i]
for p in processes:
p.join()
assert (queue.empty())
builder = tree_builder.TreeBuilder(self.output_dir, self.n_clusters)
builder.build(self.ids, self.codes, items=self.items, data=self.data)
def process_prev_result(self, queue):
if not self.prev_result:
queue.put((0, np.array(range(len(self.ids)))))
return True
di = dict()
for i, node_id in enumerate(self.ids):
di[node_id] = i
indexes = []
clusters = []
with open(self.prev_result) as f:
for line in f:
arr = line.split(",")
if len(arr) < 2:
break
ni = [di[int(m)] for m in arr]
clusters.append(ni)
indexes += ni
assert len(set(indexes)) == len(self.ids), \
"ids count: {}, index count: {}".format(len(self.ids),
len(set(indexes)))
count = len(clusters)
assert (count & (count - 1)) == 0, \
"Prev cluster count: {}".format(count)
for i, ni in enumerate(clusters):
queue.put((i + count - 1, np.array(ni)))
return True
def _train(self, pipe, queue):
last_size = -1
catch_time = 0
processed = False
code = np.zeros((len(self.ids), ), dtype=np.int64)
while True:
for _ in range(3):
try:
pcode, index = queue.get(timeout=self.timeout)
except:
index = None
if index is not None:
break
if index is None:
if processed and (last_size <= self.mini_batch or
catch_time >= 3):
print("Process {} exits".format(os.getpid()))
break
else:
print("Got empty job, pid: {}, time: {}".format(os.getpid(
), catch_time))
catch_time += 1
continue
processed = True
catch_time = 0
last_size = len(index)
if last_size <= self.mini_batch:
self._minbatch(pcode, index, code)
else:
start = time.time()
sub_index = self._cluster(index)
if last_size > self.mini_batch:
print("Train iteration done, pcode:{}, "
"data size: {}, elapsed time: {}"
.format(pcode, len(index), time.time() - start))
self.timeout = int(0.4 * self.timeout + 0.6 * (time.time() -
start))
if self.timeout < 5:
self.timeout = 5
for i in range(self.n_clusters):
if len(sub_index[i]) > 1:
queue.put(
(self.n_clusters * pcode + i + 1, sub_index[i]))
process_count = 0
for c in code:
if c > 0:
process_count += 1
print("Process {} process {} items".format(os.getpid(), process_count))
pipe.send(code)
def _minbatch(self, pcode, index, code):
dq = collections.deque()
dq.append((pcode, index))
batch_size = len(index)
tstart = time.time()
while dq:
pcode, index = dq.popleft()
if len(index) <= self.n_clusters:
for i in range(len(index)):
code[index[i]] = self.n_clusters * pcode + i + 1
continue
sub_index = self._cluster(index)
for i in range(self.n_clusters):
if len(sub_index[i]) > 1:
dq.append((self.n_clusters * pcode + i + 1, sub_index[i]))
elif len(sub_index[i]) > 0:
for j in range(len(sub_index[i])):
code[sub_index[i][j]] = self.n_clusters * \
pcode + i + j + 1
print("Minbatch, batch size: {}, elapsed: {}".format(
batch_size, time.time() - tstart))
def _cluster(self, index):
data = self.data[index]
kmeans = KMeans(n_clusters=self.n_clusters, random_state=0).fit(data)
labels = kmeans.labels_
sub_indexes = []
remain_index = []
ave_num = len(index) // self.n_clusters  # integer target size per cluster, used for slicing below
for i in range(self.n_clusters):
sub_i = np.where(labels == i)[0]
sub_index = index[sub_i]
if len(sub_index) <= ave_num:
sub_indexes.append(sub_index)
else:
distances = kmeans.transform(data[sub_i])[:, i]
sorted_index = sub_index[np.argsort(distances)]
sub_indexes.append(sorted_index[:ave_num])
remain_index.extend(list(sorted_index[ave_num:]))
idx = 0
while idx < self.n_clusters and len(remain_index) > 0:
if len(sub_indexes[idx]) >= ave_num:
idx += 1
else:
diff = min(len(remain_index), ave_num - len(sub_indexes[idx]))
sub_indexes[idx] = np.append(sub_indexes[idx],
np.array(remain_index[0:diff]))
remain_index = remain_index[diff:]
idx += 1
if len(remain_index) > 0:
sub_indexes[0] = np.append(sub_indexes[0], np.array(remain_index))
return sub_indexes
def _cluster1(self, index):
pass
def _rebalance(self, lindex, rindex, distances):
sorted_index = rindex[np.argsort(distances)]
idx = np.concatenate((lindex, sorted_index))
mid = int(len(idx) / 2)
return idx[mid:], idx[:mid]
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Tree cluster")
parser.add_argument(
"--embed_file",
required=True,
help="filename of the embedded vector file")
parser.add_argument(
"--emb_size",
type=int,
default=64,
help="dimension of input embedded vector")
parser.add_argument(
"--id_offset",
default=None,
help="id offset of the generated tree internal node")
parser.add_argument(
"--parall",
type=int,
default=16,
help="Parall execution process number")
parser.add_argument(
"--prev_result",
default=None,
help="filename of the previous cluster reuslt")
argments = parser.parse_args()
t1 = time.time()
cluster = Cluster(argments.embed_file, argments.emb_size,
argments.id_offset, argments.parall,
argments.prev_result)
cluster.train()
t2 = time.time()
print("Train complete successfully, elapsed: {}".format(t2 - t1))
# -*- coding=utf8 -*-
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle
import paddle.fluid as fluid
import numpy as np
import json
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
"--mode",
default="create_fake_emb",
choices=["create_fake_emb", "save_item_emb"],
type=str,
help=".")
parser.add_argument("--emb_id_nums", default=13, type=int, help=".")
parser.add_argument("--emb_shape", default=64, type=int, help=".")
parser.add_argument("--emb_path", default='./item_emb.txt', type=str, help='.')
args = parser.parse_args()
def create_fake_emb(emb_id_nums, emb_shape, emb_path):
x = fluid.data(name="item", shape=[1], lod_level=1, dtype="int64")
# use layers.embedding to init emb value
item_emb = fluid.layers.embedding(
input=x,
is_sparse=True,
size=[emb_id_nums, emb_shape],
param_attr=fluid.ParamAttr(
name="Item_Emb",
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=2.0)))
# run startup to init emb tensor
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
# get np.array(emb_tensor)
print("Get Emb")
item_emb_array = np.array(fluid.global_scope().find_var("Item_Emb")
.get_tensor())
with open(emb_path, 'w+') as f:
emb_str = ""
for index, value in enumerate(item_emb_array):
line = []
for v in value:
line.append(str(v))
line_str = " ".join(line)
line_str += "\t"
line_str += str(index)
line_str += "\n"
emb_str += line_str
f.write(emb_str)
print("Item Emb write Finish")
if __name__ == "__main__":
create_fake_emb(args.emb_id_nums, args.emb_shape, args.emb_path)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
from cluster import Cluster
import time
import argparse
from tree_search_util import tree_search_main
parser = argparse.ArgumentParser()
parser.add_argument("--emd_path", default='', type=str, help=".")
parser.add_argument("--emb_size", default=64, type=int, help=".")
parser.add_argument("--threads", default=1, type=int, help=".")
parser.add_argument("--n_clusters", default=3, type=int, help=".")
parser.add_argument("--output_dir", default='', type=str, help='.')
args = parser.parse_args()
def main():
cur_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
if not os.path.exists(args.output_dir):
os.system("mkdir -p " + args.output_dir)
print('%s start build tree' % cur_time)
# 1. Tree clustering, generating two files in current directory, tree.pkl, id2item.json
cluster = Cluster(
args.emd_path,
args.emb_size,
parall=args.threads,
output_dir=args.output_dir,
_n_clusters=args.n_clusters)
cluster.train()
# 2. Tree searching, generating tree_info, travel_list, layer_list for train process.
tree_search_main(
os.path.join(args.output_dir, "tree.pkl"),
os.path.join(args.output_dir, "id2item.json"), args.output_dir,
args.n_clusters)
if __name__ == "__main__":
main()
# Copyright (C) 2016-2018 Alibaba Group Holding Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import sys
import os
import codecs
from tree_impl import _build
_CUR_DIR = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(_CUR_DIR, ".."))
class TreeBuilder:
def __init__(self, output_dir='./', n_clusters=2):
self.output_dir = output_dir
self.n_clusters = n_clusters
def build(
self,
ids,
codes,
data=None,
items=None,
id_offset=None, ):
_build(ids, codes, data, items, self.output_dir, self.n_clusters)
def _ancessors(self, code):
ancs = []
while code > 0:
code = int((code - 1) / 2)
ancs.append(code)
return ancs
# Copyright (C) 2016-2018 Alibaba Group Holding Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from anytree import NodeMixin, RenderTree
import numpy as np
from anytree.exporter.dictexporter import DictExporter
import pickle
import json
import os
import time
class BaseClass(object):
pass
class TDMTreeClass(BaseClass, NodeMixin):
def __init__(self,
key_code,
emb_vec,
ids=None,
text=None,
parent=None,
children=None):
super(TDMTreeClass, self).__init__()
self.key_code = key_code
self.ids = ids
self.emb_vec = emb_vec
self.text = text
self.parent = parent
if children:
self.children = children
def set_parent(self, parent):
self.parent = parent
def set_children(self, children):
self.children = children
def _build(ids, codes, data, items, output_dir, n_clusters=2):
code_list = [0] * 50000000
node_dict = {}
max_code = 0
id2item = {}
curtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
print('%s start gen code_list' % curtime)
for _id, code, datum, item in zip(ids, codes, data, items):
code_list[code] = [datum, _id]
id2item[str(_id)] = item
max_code = max(code, max_code)
ancessors = _ancessors(code, n_clusters)
for ancessor in ancessors:
code_list[ancessor] = [[]]
for code in range(max_code, -1, -1):
if code_list[code] == 0:
continue
if len(code_list[code]) > 1:
pass
elif len(code_list[code]) == 1:
code_list[code][0] = np.mean(code_list[code][0], axis=0)
if code > 0:
ancessor = int((code - 1) / n_clusters)
code_list[ancessor][0].append(code_list[code][0])
print('start gen node_dict')
for code in range(0, max_code + 1):
if code_list[code] == 0:
continue
if len(code_list[code]) > 1:
[datum, _id] = code_list[code]
node_dict[code] = TDMTreeClass(code, emb_vec=datum, ids=_id)
elif len(code_list[code]) == 1:
[datum] = code_list[code]
node_dict[code] = TDMTreeClass(code, emb_vec=datum)
if code > 0:
ancessor = int((code - 1) / n_clusters)
node_dict[code].set_parent(node_dict[ancessor])
save_tree(node_dict[0], os.path.join(output_dir, 'tree.pkl'))
save_dict(id2item, os.path.join(output_dir, 'id2item.json'))
def render(root):
for row in RenderTree(root, childiter=reversed):
print("%s%s" % (row.pre, row.node.text))
def save_tree(root, path):
print('save tree to %s' % path)
exporter = DictExporter()
data = exporter.export(root)
f = open(path, 'wb')
pickle.dump(data, f)
f.close()
def save_dict(dic, filename):
"""save dict into json file"""
print('save dict to %s' % filename)
with open(filename, "w") as json_file:
json.dump(dic, json_file, ensure_ascii=False)
def _ancessors(code, n_clusters):
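# In the array encoding of a complete n-ary tree used here, the parent of node `code` is (code - 1) // n_clusters.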
ancs = []
while code > 0:
code = int((code - 1) / n_clusters)
ancs.append(code)
return ancs
# -*- coding=utf8 -*-
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import pickle
import time
import os
import numpy as np
from anytree import (AsciiStyle, LevelOrderGroupIter, LevelOrderIter, Node,
NodeMixin, RenderTree)
from anytree.importer.dictimporter import DictImporter
from anytree.iterators.abstractiter import AbstractIter
from anytree.walker import Walker
from tree_impl import TDMTreeClass
class myLevelOrderIter(AbstractIter):
@staticmethod
def _iter(children, filter_, stop, maxlevel):
level = 1
while children:
next_children = []
for child in children:
if filter_(child):
yield child, level
next_children += AbstractIter._get_children(child.children,
stop)
children = next_children
level += 1
if AbstractIter._abort_at_level(level, maxlevel):
break
class Tree_search(object):
def __init__(self, tree_path, id2item_path, child_num=2):
self.root = None
self.id2item = None
self.item2id = None
self.child_num = child_num
self.load(tree_path)
# self.load_id2item(id2item_path)
self.level_code = [[]]
self.max_level = 0
self.keycode_id_dict = {}
# embedding
self.keycode_nodeid_dict = {}
self.tree_info = []
self.id_node_dict = {}
self.get_keycode_mapping()
self.travel_tree()
self.get_children()
def get_keycode_mapping(self):
nodeid = 0
self.embedding = []
print("Begin Keycode Mapping")
for node in myLevelOrderIter(self.root):
node, level = node
if level - 1 > self.max_level:
self.max_level = level - 1
self.level_code.append([])
if node.ids is not None:
self.keycode_id_dict[node.key_code] = node.ids
self.id_node_dict[node.ids] = node
self.keycode_nodeid_dict[node.key_code] = nodeid
self.level_code[self.max_level].append(nodeid)
node_infos = []
if node.ids is not None: # item_id
node_infos.append(node.ids)
else:
node_infos.append(0)
node_infos.append(self.max_level) # layer_id
if node.parent: # ancestor_id
node_infos.append(self.keycode_nodeid_dict[
node.parent.key_code])
else:
node_infos.append(0)
self.tree_info.append(node_infos)
self.embedding.append(node.emb_vec)
nodeid += 1
if nodeid % 1000 == 0:
print("travel node id {}".format(nodeid))
def load(self, path):
print("Begin Load Tree")
f = open(path, "rb")
data = pickle.load(f)
pickle.dump(data, open(path, "wb"), protocol=2)
importer = DictImporter()
self.root = importer.import_(data)
f.close()
def load_id2item(self, path):
"""load dict from json file"""
with open(path, "rb") as json_file:
self.id2item = json.load(json_file)
self.item2id = {value: int(key) for key, value in self.id2item.items()}
def get_children(self):
"""get every node children info"""
print("Begin Keycode Mapping")
for node in myLevelOrderIter(self.root):
node, level = node
node_id = self.keycode_nodeid_dict[node.key_code]
child_idx = 0
if node.children:
for child in node.children:
self.tree_info[node_id].append(self.keycode_nodeid_dict[
child.key_code])
child_idx += 1
while child_idx < self.child_num:
self.tree_info[node_id].append(0)
child_idx += 1
if node_id % 1000 == 0:
print("get children node id {}".format(node_id))
def travel_tree(self):
self.travel_list = []
tree_walker = Walker()
print("Begin Travel Tree")
for item in sorted(self.id_node_dict.keys()):
node = self.id_node_dict[int(item)]
paths, _, _ = tree_walker.walk(node, self.root)
paths = list(paths)
paths.reverse()
travel = [self.keycode_nodeid_dict[i.key_code] for i in paths]
while len(travel) < self.max_level:
travel.append(0)
self.travel_list.append(travel)
def tree_search_main(tree_path, id2item_path, output_dir, n_clusters=2):
print("Begin Tree Search")
t = Tree_search(tree_path, id2item_path, n_clusters)
# 1. Walk all leaf nodes, get travel path array
travel_list = np.array(t.travel_list)
np.save(os.path.join(output_dir, "travel_list.npy"), travel_list)
with open(os.path.join(output_dir, "travel_list.txt"), 'w') as fout:
for i, travel in enumerate(t.travel_list):
travel = map(str, travel)
fout.write(','.join(travel))
fout.write("\n")
print("End Save tree travel")
# 2. Walk all layer of tree, get layer array
layer_num = 0
with open(os.path.join(output_dir, "layer_list.txt"), 'w') as fout:
for layer in t.level_code:
# exclude layer 0
if layer_num == 0:
layer_num += 1
continue
for idx in range(len(layer) - 1):
fout.write(str(layer[idx]) + ',')
fout.write(str(layer[-1]) + "\n")
print("Layer {} has {} node, the first {}, the last {}".format(
layer_num, len(layer), layer[0], layer[-1]))
layer_num += 1
print("End Save tree layer")
# 3. Walk all node of tree, get tree info
tree_info = np.array(t.tree_info)
np.save(os.path.join(output_dir, "tree_info.npy"), tree_info)
with open(os.path.join(output_dir, "tree_info.txt"), 'w') as fout:
for i, node_infos in enumerate(t.tree_info):
node_infos = map(str, node_infos)
fout.write(','.join(node_infos))
fout.write("\n")
print("End Save tree info")
# 4. save embedding
embedding = np.array(t.embedding)
np.save(os.path.join(output_dir, "tree_emb.npy"), embedding)
with open(os.path.join(output_dir, "tree_embedding.txt"), "w") as fout:
for i, emb in enumerate(t.embedding):
emb = map(str, emb)
fout.write(','.join(emb))
fout.write("\n")
if __name__ == "__main__":
tree_path = "./tree.pkl"
id2item_path = "./id2item.json"
output_dir = "./output"
if not os.path.exists(output_dir):
os.system("mkdir -p " + output_dir)
tree_search_main(tree_path, id2item_path, output_dir)
#encoding=utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -11,8 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/python
#-*- coding:utf-8 -*-
"""
docstring
"""
......@@ -21,10 +20,10 @@ import os
import sys
if len(sys.argv) < 2:
print "usage:python %s input" % (sys.argv[0])
print("usage:python {} input".format(sys.argv[0]))
sys.exit(-1)
fin = file(sys.argv[1])
fin = open(sys.argv[1])
pos_num = 0
neg_num = 0
......@@ -42,15 +41,15 @@ for line in fin:
cols = line.strip().split("\t")
cnt += 1
if cnt % 500000 == 0:
print "cnt:", cnt, 1.0 * pos_num / neg_num
print("cnt:{}".format(1.0 * pos_num / neg_num))
if len(cols) != 3:
continue
cur_query = cols[0]
if cur_query != last_query:
query_num += 1
for i in xrange(0, len(score_list)):
for j in xrange(i + 1, len(score_list)):
for i in range(0, len(score_list)):
for j in range(i + 1, len(score_list)):
if label_list[i] == label_list[j]:
continue
pair_num += 1
......@@ -74,8 +73,8 @@ for line in fin:
fin.close()
for i in xrange(0, len(score_list)):
for j in xrange(i + 1, len(score_list)):
for i in range(0, len(score_list)):
for j in range(i + 1, len(score_list)):
if label_list[i] == label_list[j]:
continue
pair_num += 1
......@@ -89,9 +88,9 @@ for i in xrange(0, len(score_list)):
equal_num += 1
if neg_num > 0:
print "pnr:", 1.0 * pos_num / neg_num
print "query_num:", query_num
print "pair_num:", pos_num + neg_num + equal_num, pair_num
print "equal_num:", equal_num
print "正序率:", 1.0 * pos_num / (pos_num + neg_num)
print pos_num, neg_num
print("pnr:{}".format(1.0 * pos_num / neg_num))
print("query_num:{}".format(query_num))
print("pair_num:{} , {}".format(pos_num + neg_num + equal_num, pair_num))
print("equal_num:{}".format(equal_num))
print("正序率: {}".format(1.0 * pos_num / (pos_num + neg_num)))
print("pos_num: {} , neg_num: {}".format(pos_num, neg_num))