Commit 05fe3b93 authored by yinhaofeng

add model multiview-simnet

Parent d4a280b5
......@@ -18,12 +18,12 @@ workspace: "models/match/multiview-simnet"
 # list of dataset
 dataset:
 - name: dataset_train # name of dataset to distinguish different datasets
-  batch_size: 2
+  batch_size: 128
   type: DataLoader # or QueueDataset
   data_path: "{workspace}/data/train"
-  sparse_slots: "1 2 3"
+  sparse_slots: "0 1 2"
 - name: dataset_infer # name
-  batch_size: 2
+  batch_size: 1
   type: DataLoader # or QueueDataset
   data_path: "{workspace}/data/test"
   sparse_slots: "1 2"
......@@ -34,24 +34,24 @@ hyper_parameters:
     class: Adam
     learning_rate: 0.0001
     strategy: async
-  query_encoder: "bow"
-  title_encoder: "bow"
+  query_encoder: "gru"
+  title_encoder: "gru"
   query_encode_dim: 128
   title_encode_dim: 128
-  sparse_feature_dim: 1000001
+  sparse_feature_dim: 1439
   embedding_dim: 128
   hidden_size: 128
   margin: 0.1
 # select runner by name
-mode: train_runner
+mode: [train_runner,infer_runner]
 # config of each runner.
 # runner is a kind of paddle training class, which wraps the train/infer process.
 runner:
 - name: train_runner
   class: train
   # num of epochs
-  epochs: 2
+  epochs: 3
   # device to run training or infer
   device: cpu
   save_checkpoint_interval: 1 # save model interval of epochs
......@@ -62,12 +62,14 @@ runner:
   save_inference_fetch_varnames: [] # fetch vars of save inference
   init_model_path: "" # load model path
   print_interval: 1
+  phases: phase1
 - name: infer_runner
   class: infer
   # device to run training or infer
   device: cpu
   print_interval: 1
-  init_model_path: "increment/0" # load model path
+  init_model_path: "increment/2" # load model path
+  phases: phase2
 # runner will run all the phase in each epoch
 phase:
......@@ -75,7 +77,7 @@ phase:
   model: "{workspace}/model.py" # user-defined model
   dataset_name: dataset_train # select dataset by name
   thread_num: 1
-#- name: phase2
-#  model: "{workspace}/model.py" # user-defined model
-#  dataset_name: dataset_infer # select dataset by name
-#  thread_num: 1
+- name: phase2
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_infer # select dataset by name
+  thread_num: 1
#encoding=utf-8
import random

f = open("./zhidao", "r")
lines = f.readlines()
f.close()

# build the vocabulary, ordered by word frequency
word_dict = {}
for line in lines:
    line = line.strip().split("\t")
    text = line[0].split(" ") + line[1].split(" ")
    for word in text:
        if word in word_dict:
            word_dict[word] = word_dict[word] + 1
        else:
            word_dict[word] = 1
word_list = sorted(word_dict.items(), key=lambda item: item[1], reverse=True)
word_list_ids = range(1, len(word_list) + 1)
word_dict = dict(zip([x[0] for x in word_list], word_list_ids))

f = open("./zhidao", "r")
lines = f.readlines()
f.close()

# split into a training set and a test set
lines = [line.strip().split("\t") for line in lines]
random.shuffle(lines)
train_set = lines[:900]
test_set = lines[900:]

# build a dict keyed by query whose values are its negative examples
neg_dict = {}
for line in train_set:
    if line[2] == "0":
        if line[0] in neg_dict:
            neg_dict[line[0]].append(line[1])
        else:
            neg_dict[line[0]] = [line[1]]

# build a dict keyed by query whose values are its positive examples
pos_dict = {}
for line in train_set:
    if line[2] == "1":
        if line[0] in pos_dict:
            pos_dict[line[0]].append(line[1])
        else:
            pos_dict[line[0]] = [line[1]]

# arrange the training set into query, pos, neg triples
f = open("train.txt", "w")
for query in pos_dict.keys():
    for pos in pos_dict[query]:
        if query not in neg_dict:
            continue
        for neg in neg_dict[query]:
            f.write(str(query) + "\t" + str(pos) + "\t" + str(neg) + "\n")
f.close()

f = open("train.txt", "r")
lines = f.readlines()
f.close()

# convert the query, pos, neg triples of the training set into slot:token format
f = open("train.txt", "w")
for line in lines:
    line = line.strip().split("\t")
    query = line[0].strip().split(" ")
    pos = line[1].strip().split(" ")
    neg = line[2].strip().split(" ")
    query_list = []
    for word in query:
        query_list.append(word_dict[word])
    pos_list = []
    for word in pos:
        pos_list.append(word_dict[word])
    neg_list = []
    for word in neg:
        neg_list.append(word_dict[word])
    f.write(' '.join(["0:" + str(x) for x in query_list]) + " " + ' '.join([
        "1:" + str(x) for x in pos_list
    ]) + " " + ' '.join(["2:" + str(x) for x in neg_list]) + "\n")
f.close()

# convert the query and pos of the test set into slot:token format,
# writing the labels to a separate file
f = open("test.txt", "w")
fa = open("label.txt", "w")
for line in test_set:
    query = line[0].strip().split(" ")
    pos = line[1].strip().split(" ")
    label = line[2]
    query_list = []
    for word in query:
        query_list.append(word_dict[word])
    pos_list = []
    for word in pos:
        pos_list.append(word_dict[word])
    f.write(' '.join(["0:" + str(x) for x in query_list]) + " " + ' '.join(
        ["1:" + str(x) for x in pos_list]) + "\n")
    fa.write(label + "\n")
f.close()
fa.close()
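# Illustrative summary of the transformation above (the word ids depend on the
# frequency-ordered vocabulary, so the numbers below are hypothetical):
#   zhidao line:    "w1 w2\tw3 w4\t1"
#   train.txt line: "0:12 0:7 1:3 1:25 2:40 2:9"   (query, pos, neg slots)
#   test.txt line:  "0:12 0:7 1:3 1:25"            (query, pos slots)
#   label.txt line: "1"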
Source diff not shown because the file is too large. You can view the blob instead.
#! /bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -14,11 +12,31 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -e
-echo "begin to prepare data"
-mkdir -p data/train
-mkdir -p data/test
-python generate_synthetic_data.py
+import sklearn.metrics
+
+# read the ground-truth labels written by preprocess.py,
+# skipping the first line of label.txt
+label = []
+filename = './data/label.txt'
+f = open(filename, "r")
+f.readline()
+num = 0
+for line in f.readlines():
+    num = num + 1
+    line = line.strip()
+    label.append(float(line))
+f.close()
+print(num)
+
+# parse the predicted query/positive-title similarity out of each
+# line of result.txt
+filename = './result.txt'
+sim = []
+for line in open(filename):
+    line = line.strip().split(",")
+    line[1] = line[1].split(":")
+    line = line[1][1].strip(" ")
+    line = line.strip("[")
+    line = line.strip("]")
+    sim.append(float(line))
+
+auc = sklearn.metrics.roc_auc_score(label, sim)
+print("auc = ", auc)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlerec.core.reader import ReaderBase
from paddlerec.core.utils import envs


class Reader(ReaderBase):
    """Reader for evaluation data: query slots followed by title slots."""

    def init(self):
        self.query_slots = envs.get_global_env("hyper_parameters.query_slots",
                                               None, "train.model")
        self.title_slots = envs.get_global_env("hyper_parameters.title_slots",
                                               None, "train.model")

        # slot ids: [0, query_slots) for the query, then the title slots
        self.all_slots = []
        for i in range(self.query_slots):
            self.all_slots.append(str(i))
        for i in range(self.title_slots):
            self.all_slots.append(str(i + self.query_slots))

        self._all_slots_dict = dict()
        for index, slot in enumerate(self.all_slots):
            self._all_slots_dict[slot] = [False, index]

    def generate_sample(self, line):
        def data_iter():
            elements = line.rstrip().split()
            padding = 0
            output = [(slot, []) for slot in self.all_slots]
            for elem in elements:
                feasign, slot = elem.split(':')
                if slot not in self._all_slots_dict:
                    continue
                self._all_slots_dict[slot][0] = True
                index = self._all_slots_dict[slot][1]
                output[index][1].append(int(feasign))
            # reset visit flags and pad slots missing from this line
            for slot in self._all_slots_dict:
                visit, index = self._all_slots_dict[slot]
                if visit:
                    self._all_slots_dict[slot][0] = False
                else:
                    output[index][1].append(padding)
            yield output

        return data_iter
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random


class Dataset:
    def __init__(self):
        pass


class SyntheticDataset(Dataset):
    def __init__(self,
                 sparse_feature_dim,
                 query_slot_num,
                 title_slot_num,
                 dataset_size=10000):
        # ids are randomly generated
        self.ids_per_slot = 10
        self.sparse_feature_dim = sparse_feature_dim
        self.query_slot_num = query_slot_num
        self.title_slot_num = title_slot_num
        self.dataset_size = dataset_size

    def _reader_creator(self, is_train):
        def generate_ids(num, space):
            return [random.randint(0, space - 1) for i in range(num)]

        def reader():
            for i in range(self.dataset_size):
                query_slots = []
                pos_title_slots = []
                neg_title_slots = []
                for i in range(self.query_slot_num):
                    qslot = generate_ids(self.ids_per_slot,
                                         self.sparse_feature_dim)
                    qslot = [str(fea) + ':' + str(i) for fea in qslot]
                    query_slots += qslot
                for i in range(self.title_slot_num):
                    pt_slot = generate_ids(self.ids_per_slot,
                                           self.sparse_feature_dim)
                    pt_slot = [
                        str(fea) + ':' + str(i + self.query_slot_num)
                        for fea in pt_slot
                    ]
                    pos_title_slots += pt_slot
                if is_train:
                    # training samples carry an extra group of negative-title slots
                    for i in range(self.title_slot_num):
                        nt_slot = generate_ids(self.ids_per_slot,
                                               self.sparse_feature_dim)
                        nt_slot = [
                            str(fea) + ':' +
                            str(i + self.query_slot_num + self.title_slot_num)
                            for fea in nt_slot
                        ]
                        neg_title_slots += nt_slot
                    yield query_slots + pos_title_slots + neg_title_slots
                else:
                    yield query_slots + pos_title_slots

        return reader

    def train(self):
        return self._reader_creator(True)

    def valid(self):
        return self._reader_creator(True)

    def test(self):
        return self._reader_creator(False)


if __name__ == '__main__':
    sparse_feature_dim = 1000001
    query_slots = 1
    title_slots = 1
    dataset_size = 10
    dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots,
                               dataset_size)
    train_reader = dataset.train()
    test_reader = dataset.test()

    with open("data/train/train.txt", 'w') as fout:
        for data in train_reader():
            fout.write(' '.join(data))
            fout.write("\n")

    with open("data/test/test.txt", 'w') as fout:
        for data in test_reader():
            fout.write(' '.join(data))
            fout.write("\n")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlerec.core.reader import ReaderBase
from paddlerec.core.utils import envs


class Reader(ReaderBase):
    """Reader for training data: query, positive-title and negative-title slots."""

    def init(self):
        self.query_slots = envs.get_global_env("hyper_parameters.query_slots",
                                               None, "train.model")
        self.title_slots = envs.get_global_env("hyper_parameters.title_slots",
                                               None, "train.model")

        # slot ids: query slots first, then positive-title slots,
        # then negative-title slots
        self.all_slots = []
        for i in range(self.query_slots):
            self.all_slots.append(str(i))
        for i in range(self.title_slots):
            self.all_slots.append(str(i + self.query_slots))
        for i in range(self.title_slots):
            self.all_slots.append(str(i + self.query_slots + self.title_slots))

        self._all_slots_dict = dict()
        for index, slot in enumerate(self.all_slots):
            self._all_slots_dict[slot] = [False, index]

    def generate_sample(self, line):
        def data_iter():
            elements = line.rstrip().split()
            padding = 0
            output = [(slot, []) for slot in self.all_slots]
            for elem in elements:
                feasign, slot = elem.split(':')
                if slot not in self._all_slots_dict:
                    continue
                self._all_slots_dict[slot][0] = True
                index = self._all_slots_dict[slot][1]
                output[index][1].append(int(feasign))
            # reset visit flags and pad slots missing from this line
            for slot in self._all_slots_dict:
                visit, index = self._all_slots_dict[slot]
                if visit:
                    self._all_slots_dict[slot][0] = False
                else:
                    output[index][1].append(padding)
            yield output

        return data_iter
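# A short illustrative walk-through of this reader (assuming query_slots = 1
# and title_slots = 1, so the expected slot ids are "0" for the query, "1" for
# the positive title and "2" for the negative title; the reader consumes the
# feasign:slot format produced by generate_synthetic_data.py):
#
#   line = "358:0 1144:1 217:2"
#   generate_sample(line) groups the feasigns by slot and yields, in slot
#   order: [("0", [358]), ("1", [1144]), ("2", [217])]
#   A slot missing from the line is padded with a single 0 feasign.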
# multiview-simnet text matching model

The following is a brief directory structure and description of this example:
```
├── data # sample data
    ├── train
        ├── train.txt # training data sample
    ├── test
        ├── test.txt # test data sample
    ├── preprocess.py # data preprocessing script
├── __init__.py
├── README.md # documentation
├── model.py # model file
├── config.yaml # configuration file
├── run.sh # run script
├── eval.py # evaluation script
```
Note: before reading this example, we suggest you first read the following:
[paddlerec tutorial](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)

## Contents

- [Model Introduction](#model-introduction)
- [Data Preparation](#data-preparation)
- [Environment](#environment)
- [Quick Start](#quick-start)
- [Reproducing the Results](#reproducing-the-results)
- [Advanced Usage](#advanced-usage)
- [FAQ](#faq)
## Model Introduction
In personalized recommendation scenarios, the list of items shown to a user is usually computed by a personalized matching model. In the real world, a user has features from many views, such as the user id, age, and the click history of items; an item, for example a news article, likewise has multi-view features such as its title and category. The multi-view simnet model is a unified model that fuses the multi-view features of users and items and learns personalized matching. It consists of several encoder modules, one applied to each feature view. The project currently provides a Bag-of-Embedding encoder, a Temporal-Convolutional encoder and a Gated-Recurrent-Unit encoder; more encoders that are practical in sparse-feature scenarios will be added over time. Training currently follows the pairwise ranking paradigm: for a related User-Item pair, a randomly sampled item is used as the negative example for ranking.
For the details of the model, see the paper [MultiView-Simnet](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/frp1159-songA.pdf):
<p align="center">
<img align="center" src="../../../doc/imgs/multiview-simnet.png">
</p>
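The pairwise ranking objective can be summarized with a minimal sketch (a simplified numpy illustration, not the actual Paddle implementation in model.py; margin corresponds to the `margin: 0.1` hyper-parameter in config.yaml):
```
import numpy as np

def pairwise_hinge_loss(pos_sim, neg_sim, margin=0.1):
    # push the positive similarity above the negative one by at least `margin`
    return np.maximum(0.0, neg_sim + margin - pos_sim)

print(pairwise_hinge_loss(0.8, 0.3))  # 0.0  (positive already wins by the margin)
print(pairwise_hinge_loss(0.4, 0.5))  # 0.2  (negative ranked higher -> positive loss)
```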
## Data Preparation
We have released our in-house test sets, including the Baidu Zhidao, ECOM, QQSIM and UNICOM datasets. Here we pick the Baidu Zhidao dataset for training. The datasets can be downloaded with the following commands.
```
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm simnet_dataset-1.0.0.tar.gz
```
Each word in the data is identified in the form {slot:token}: a slot marking which sentence the word belongs to, followed by the token of the word:
```
0:358 0:206 0:205 0:250 0:9 0:3 0:207 0:10 0:330 0:164 1:1144 1:217 1:206 1:9 1:3 1:207 1:10 1:398 1:2 2:217 2:206 2:9 2:3 2:207 2:10 2:398 2:2
0:358 0:206 0:205 0:250 0:9 0:3 0:207 0:10 0:330 0:164 1:951 1:952 1:206 1:9 1:3 1:207 1:10 1:398 2:217 2:206 2:9 2:3 2:207 2:10 2:398 2:2
```
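As a quick illustration (a hypothetical helper, not part of the model code; during training the parsing is handled by PaddleRec's built-in slot reader), one such line can be grouped back into per-slot token lists like this:
```
def parse_line(line):
    # group the slot:token pairs of one sample by slot id
    slots = {}
    for elem in line.split():
        slot, token = elem.split(":")
        slots.setdefault(slot, []).append(int(token))
    return slots

line = "0:358 0:206 1:1144 1:217 2:217 2:206"
print(parse_line(line))
# {'0': [358, 206], '1': [1144, 217], '2': [217, 206]}
```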
## Environment
PaddlePaddle>=1.7.2
python 2.7
PaddleRec >=0.1
os : linux
## Quick Start
This example ships with sample data so you can try it quickly; run the following command from the paddlerec directory to start training:
```
python -m paddlerec.run -m models/match/multiview-simnet/config.yaml
```
## Reproducing the Results
To help users run every model quickly, we provide sample data under each model. To reproduce the results in this readme, follow the steps below in order.
1. Make sure your current directory is PaddleRec/models/match/multiview-simnet
2. Download and unpack the dataset in the data directory with the following commands:
```
cd data
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm -f simnet_dataset-1.0.0.tar.gz
mv data/zhidao ./
rm -rf data
```
3. We provide a script that quickly converts the Chinese text in the dataset into a trainable format. After unpacking the dataset you will see a file named zhidao in the directory. Run the provided preprocess.py under a python3 environment to generate the training-ready files test.txt, train.txt and label.txt, then move them into the train and test directories so training can use them. The commands are:
```
python3 preprocess.py
rm -f ./train/train.txt
mv train.txt ./train
rm -f ./test/test.txt
mv test.txt ./test
cd ..
```
4. Go back to the multiview-simnet directory, open config.yaml and change its parameters:
set workspace to your current absolute path (the pwd command prints it).
5. Run the script to start training. It launches training via python -m paddlerec.run -m ./config.yaml, writes the results to the result file, and then runs the evaluation script eval.py to compute the auc:
```
sh run.sh
```
The output looks roughly like:
```
('auc = ', 0.5944897959183673)
```
## Advanced Usage
## FAQ
#!/bin/bash
echo "................run................."
# train and infer, saving all console output
python -m paddlerec.run -m ./config.yaml >result1.txt
# keep only the lines containing the query/positive-title similarity
grep -i "query_pt_sim" ./result1.txt >./result2.txt
# drop the last line, clean up, then evaluate
sed '$d' result2.txt >result.txt
rm -f result1.txt
rm -f result2.txt
python eval.py