From d831712afc8850ff0004e56766e5146772a8930a Mon Sep 17 00:00:00 2001 From: overlordmax <37664905+overlordmax@users.noreply.github.com> Date: Tue, 19 May 2020 11:51:31 +0800 Subject: [PATCH] Listwise 05191124 (#4643) * fix bugs * fix bugs * add wide_deep * fix code style * fix code style * fix some bugs * fix filename * add ncf * add download data * add download data * add youtube dnn * edit README.md * fix some bugs * add listwise * fix code style --- PaddleRec/rerank/listwise/README.md | 133 +++++++++++++++++++++++++ PaddleRec/rerank/listwise/args.py | 39 ++++++++ PaddleRec/rerank/listwise/evaluator.py | 118 ++++++++++++++++++++++ PaddleRec/rerank/listwise/infer.py | 82 +++++++++++++++ PaddleRec/rerank/listwise/infer_cpu.sh | 1 + PaddleRec/rerank/listwise/infer_gpu.sh | 1 + PaddleRec/rerank/listwise/train.py | 70 +++++++++++++ PaddleRec/rerank/listwise/train_cpu.sh | 1 + PaddleRec/rerank/listwise/train_gpu.sh | 1 + PaddleRec/rerank/listwise/utils.py | 36 +++++++ 10 files changed, 482 insertions(+) create mode 100644 PaddleRec/rerank/listwise/README.md create mode 100644 PaddleRec/rerank/listwise/args.py create mode 100644 PaddleRec/rerank/listwise/evaluator.py create mode 100644 PaddleRec/rerank/listwise/infer.py create mode 100644 PaddleRec/rerank/listwise/infer_cpu.sh create mode 100644 PaddleRec/rerank/listwise/infer_gpu.sh create mode 100644 PaddleRec/rerank/listwise/train.py create mode 100644 PaddleRec/rerank/listwise/train_cpu.sh create mode 100644 PaddleRec/rerank/listwise/train_gpu.sh create mode 100644 PaddleRec/rerank/listwise/utils.py diff --git a/PaddleRec/rerank/listwise/README.md b/PaddleRec/rerank/listwise/README.md new file mode 100644 index 00000000..360283ef --- /dev/null +++ b/PaddleRec/rerank/listwise/README.md @@ -0,0 +1,133 @@ +# listwise + + 以下是本例的简要目录结构及说明: + +``` +├── README.md # 文档 +├── evaluator.py # biRnn网络文件 +├── utils.py # 通用函数 +├── args.py # 参数脚本 +├── train.py # 训练文件 +├── infer.py # 预测文件 +├── train_gpu.sh # gpu训练shell脚本 +├── 
train_cpu.sh # cpu训练shell脚本 +├── infer_gpu.sh # gpu预测shell脚本 +├── infer_cpu.sh # cpu预测shell脚本 +``` + +## 简介 + +[《Sequential Evaluation and Generation Framework for Combinatorial Recommender System》]( https://arxiv.org/pdf/1902.00245.pdf)是百度2019年发布的推荐系统融合模型,用于优化推荐序列的整体性能(如总点击),该模型由Generator和Evaluator两部分组成,Generator负责生成若干个候选序列,Evaluator负责从候选序列中筛选出最好的序列推荐给用户,达到最大化序列整体性能的目的。 + +本项目在paddlepaddle上实现该融合模型的Evaluator部分,构造数据集验证模型的正确性。 + +## 环境 + + PaddlePaddle 1.7.0 + + python3.7 + +## 单机训练 + +GPU环境 + +在train_gpu.sh脚本文件中设置好数据路径、参数。 + +```sh +CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1\ #使用gpu + --epochs 3\ + --batch_size 32\ + --model_dir './model_dir'\ #模型保存路径 + --embd_dim 16\ #embedding维度 + --hidden_size 128\ #biRNN隐层大小 + --item_vocab 200\ #item词典大小 + --user_vocab 200\ #user词典大小 + --item_len 5\ #序列长度 + --sample_size 100\ #构造数据集大小 + --base_lr 0.01 #学习率 + +``` + +修改脚本的可执行权限并运行 + +``` +./train_gpu.sh +``` + +CPU环境 + +在train_cpu.sh脚本文件中设置好数据路径、参数。 + +```sh +python train.py --use_gpu 0\ #使用cpu + --epochs 3\ + --batch_size 32\ + --model_dir './model_dir'\ #模型保存路径 + --embd_dim 16\ #embedding维度 + --hidden_size 128\ #biRNN隐层大小 + --item_vocab 200\ #item词典大小 + --user_vocab 200\ #user词典大小 + --item_len 5\ #序列长度 + --sample_size 100\ #构造数据集大小 + --base_lr 0.01 #学习率 + +``` + +修改脚本的可执行权限并运行 + +```sh +./train_cpu.sh +``` + +## 单机预测 + +GPU环境 + +在infer_gpu.sh脚本文件中设置好数据路径、参数。 + +```sh +CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1 \ #使用gpu + --model_dir './model_dir'\ + --test_epoch 19 #选择哪一轮的模型参数 + +``` + +修改脚本的可执行权限并运行 + +```sh +./infer_gpu.sh +``` + +CPU环境 + +在infer_cpu.sh脚本文件中设置好数据路径、参数。 + +```sh +python infer.py --use_gpu 0\ #使用cpu + --model_dir './model_dir'\ + --test_epoch 19 #选择哪一轮的模型参数 + +``` + +修改脚本的可执行权限并运行 + +``` +./infer_cpu.sh +``` + +## 模型效果 + +在测试集的效果如下: + +``` +W0518 21:38:58.030905 8105 device_context.cc:237] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 9.2, Runtime API Version: 9.0 +W0518 21:38:58.035158 8105 device_context.cc:245] device: 0, 
cuDNN Version: 7.3. +2020-05-18 21:38:59,553-INFO: epoch_id: 0, batch_time: 0.01643s, loss: 0.69452, auc: 0.47282 +2020-05-18 21:38:59,567-INFO: epoch_id: 0, batch_time: 0.01314s, loss: 0.77172, auc: 0.49025 +2020-05-18 21:38:59,580-INFO: epoch_id: 0, batch_time: 0.01261s, loss: 0.69282, auc: 0.51839 +...... +2020-05-18 21:39:03,702-INFO: epoch_id: 2, batch_time: 0.01287s, loss: 0.69431, auc: 0.50265 +2020-05-18 21:39:03,715-INFO: epoch_id: 2, batch_time: 0.01278s, loss: 0.69272, auc: 0.50267 +2020-05-18 21:39:03,728-INFO: epoch_id: 2, batch_time: 0.01274s, loss: 0.69340, auc: 0.50267 +``` + diff --git a/PaddleRec/rerank/listwise/args.py b/PaddleRec/rerank/listwise/args.py new file mode 100644 index 00000000..cad45797 --- /dev/null +++ b/PaddleRec/rerank/listwise/args.py @@ -0,0 +1,39 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import distutils.util +import sys + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--epochs", type=int, default=20, help="epochs") + parser.add_argument("--batch_size", type=int, default=32, help="batch_size") + parser.add_argument("--test_epoch", type=int, default=19, help="test_epoch") + parser.add_argument('--use_gpu', type=int, default=0, help='whether using gpu') + parser.add_argument('--model_dir', type=str, default='./model_dir', help='model_dir') + parser.add_argument('--embd_dim', type=int, default=16, help='embd_dim') + parser.add_argument('--hidden_size', type=int, default=128, help='hidden_size') + parser.add_argument('--item_vocab', type=int, default=200, help='item_vocab') + parser.add_argument('--user_vocab', type=int, default=200, help='user_vocab') + parser.add_argument('--item_len', type=int, default=5, help='item_len') + parser.add_argument('--sample_size', type=int, default=100, help='sample_size') + parser.add_argument('--base_lr', type=float, default=0.01, help='base_lr') + + args = parser.parse_args() + return args \ No newline at end of file diff --git a/PaddleRec/rerank/listwise/evaluator.py b/PaddleRec/rerank/listwise/evaluator.py new file mode 100644 index 00000000..d7d6c09a --- /dev/null +++ b/PaddleRec/rerank/listwise/evaluator.py @@ -0,0 +1,118 @@ +from paddle import fluid +import utils +import numpy as np + +class BiRNN(object): + def input_data(self, item_len): + user_slot_names = fluid.data(name='user_slot_names', shape=[None, 1], dtype='int64', lod_level=1) + item_slot_names = fluid.data(name='item_slot_names', shape=[None, item_len], dtype='int64', lod_level=1) + lens = fluid.data(name='lens', shape=[None], dtype='int64') + labels = fluid.data(name='labels', shape=[None, item_len], dtype='int64', lod_level=1) + + inputs = [user_slot_names] + 
[item_slot_names] + [lens] + [labels] + + return inputs + + def default_normal_initializer(self, nf=128): + return fluid.initializer.TruncatedNormal(loc=0.0, scale=np.sqrt(1.0/nf)) + + def default_param_clip(self): + return fluid.clip.GradientClipByValue(1.0) + + def default_regularizer(self): + return None + + def default_fc(self, data, size, num_flatten_dims=1, act=None, name=None): + return fluid.layers.fc(input=data, + size=size, + num_flatten_dims=num_flatten_dims, + param_attr=fluid.ParamAttr(initializer=self.default_normal_initializer(size), + gradient_clip=self.default_param_clip(), + regularizer=self.default_regularizer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(value=0.0), + gradient_clip=self.default_param_clip(), + regularizer=self.default_regularizer()), + act=act, + name=name) + + def default_embedding(self, data, vocab_size, embed_size): + gradient_clip = self.default_param_clip() + reg = fluid.regularizer.L2Decay(1e-5) # IMPORTANT, to prevent overfitting. 
+ embed = fluid.embedding(input=data, + size=[vocab_size, embed_size], + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Xavier(), + gradient_clip=gradient_clip, + regularizer=reg), + is_sparse=True) + + return embed + + def default_drnn(self, data, nf, is_reverse, h_0): + return fluid.layers.dynamic_gru(input=data, + size=nf, + param_attr=fluid.ParamAttr(initializer=self.default_normal_initializer(nf), + gradient_clip=self.default_param_clip(), + regularizer=self.default_regularizer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(value=0.0), + gradient_clip=self.default_param_clip(), + regularizer=self.default_regularizer()), + is_reverse=is_reverse, + h_0=h_0) + + def net(self, inputs, hidden_size, user_vocab, item_vocab, embed_size): + #encode + user_embedding = self.default_embedding(inputs[0], user_vocab, embed_size) + user_feature = self.default_fc(data=user_embedding, + size=hidden_size, + num_flatten_dims=1, + act='relu', + name='user_feature_fc') + + item_embedding = self.default_embedding(inputs[1], item_vocab, embed_size) + item_embedding = fluid.layers.sequence_unpad(x=item_embedding, length=inputs[2]) + + item_fc = self.default_fc(data=item_embedding, + size=hidden_size, + num_flatten_dims=1, + act='relu', + name='item_fc') + + pos = utils.fluid_sequence_get_pos(item_fc) + pos_embed = self.default_embedding(pos, user_vocab, embed_size) + pos_embed = fluid.layers.squeeze(pos_embed, [1]) + + # item gru + gru_input = self.default_fc(data=fluid.layers.concat([item_fc, pos_embed], 1), + size=hidden_size * 3, + num_flatten_dims=1, + act='relu', + name='item_gru_fc') + + item_gru_forward = self.default_drnn(data=gru_input, + nf=hidden_size, + h_0=user_feature, + is_reverse=False) + + item_gru_backward = self.default_drnn(data=gru_input, + nf=hidden_size, + h_0=user_feature, + is_reverse=True) + item_gru = fluid.layers.concat([item_gru_forward, item_gru_backward], axis=1) + + out_click_fc1 = self.default_fc(data=item_gru, + 
size=hidden_size, + num_flatten_dims=1, + act='relu', + name='out_click_fc1') + + click_prob = self.default_fc(data=out_click_fc1, + size=2, + num_flatten_dims=1, + act='softmax', + name='out_click_fc2') + + labels = fluid.layers.sequence_unpad(x=inputs[3], length=inputs[2]) + loss = fluid.layers.reduce_mean(fluid.layers.cross_entropy(input=click_prob, label=labels)) + auc_val, batch_auc, auc_states = fluid.layers.auc(input=click_prob, label=labels) + + return loss, auc_val, batch_auc, auc_states diff --git a/PaddleRec/rerank/listwise/infer.py b/PaddleRec/rerank/listwise/infer.py new file mode 100644 index 00000000..fb01f1b4 --- /dev/null +++ b/PaddleRec/rerank/listwise/infer.py @@ -0,0 +1,82 @@ +import numpy as np +import os +import paddle.fluid as fluid +import logging +import args +import random +import time +from evaluator import BiRNN + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger("fluid") +logger.setLevel(logging.INFO) + +def set_zero(var_name, scope=fluid.global_scope(), place=fluid.CPUPlace(), param_type="int64"): + """ + Set tensor of a Variable to zero. 
+ Args: + var_name(str): name of Variable + scope(Scope): Scope object, default is fluid.global_scope() + place(Place): Place object, default is fluid.CPUPlace() + param_type(str): param data type, default is int64 + """ + param = scope.var(var_name).get_tensor() + param_array = np.zeros(param._get_dims()).astype(param_type) + param.set(param_array, place) + +def run_infer(args): + model = BiRNN() + inference_scope = fluid.Scope() + startup_program = fluid.framework.Program() + test_program = fluid.framework.Program() + cur_model_path = os.path.join(args.model_dir, 'epoch_' + str(args.test_epoch), "checkpoint") + with fluid.scope_guard(inference_scope): + with fluid.framework.program_guard(test_program, startup_program): + place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() + inputs = model.input_data(args.item_len) + loss, auc_val, batch_auc, auc_states = model.net(inputs, args.hidden_size, args.batch_size*args.sample_size, args.item_vocab, args.embd_dim) + exe = fluid.Executor(place) + + fluid.load(fluid.default_main_program(), cur_model_path, exe) + for var in auc_states: # reset auc states + set_zero(var.name, scope=inference_scope, place=place) + + # Build a random data set. 
+ user_slot_names = [] + item_slot_names = [] + lens = [] + labels = [] + user_id = 0 + for i in range(args.sample_size): + user_slot_name = [] + for j in range(args.batch_size): + user_slot_name.append(user_id) + user_id += 1 + user_slot_names.append(user_slot_name) + + item_slot_name = np.random.randint(args.item_vocab, size=(args.batch_size, args.item_len)) + item_slot_names.append(item_slot_name) + lenght = np.array([args.item_len]*args.batch_size) + lens.append(lenght) + label = np.random.randint(2, size=(args.batch_size, args.item_len)) + labels.append(label) + + for i in range(args.sample_size): + begin = time.time() + loss_val, auc = exe.run(test_program, + feed={ + "user_slot_names": np.array(user_slot_names[i]).reshape(args.batch_size, 1), + "item_slot_names": item_slot_names[i].astype('int64'), + "lens": lens[i].astype('int64'), + "labels": labels[i].astype('int64') + }, + return_numpy=True, + fetch_list=[loss.name, auc_val]) + end = time.time() + logger.info("batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format( + end-begin, float(np.array(loss_val)), float(np.array(auc)))) + +if __name__ == "__main__": + + args = args.parse_args() + run_infer(args) \ No newline at end of file diff --git a/PaddleRec/rerank/listwise/infer_cpu.sh b/PaddleRec/rerank/listwise/infer_cpu.sh new file mode 100644 index 00000000..c75466da --- /dev/null +++ b/PaddleRec/rerank/listwise/infer_cpu.sh @@ -0,0 +1 @@ +python infer.py --use_gpu 0 --model_dir './model_dir' --test_epoch 19 diff --git a/PaddleRec/rerank/listwise/infer_gpu.sh b/PaddleRec/rerank/listwise/infer_gpu.sh new file mode 100644 index 00000000..0e61306d --- /dev/null +++ b/PaddleRec/rerank/listwise/infer_gpu.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1 --model_dir './model_dir' --test_epoch 19 diff --git a/PaddleRec/rerank/listwise/train.py b/PaddleRec/rerank/listwise/train.py new file mode 100644 index 00000000..314d53a9 --- /dev/null +++ b/PaddleRec/rerank/listwise/train.py @@ -0,0 +1,70 
@@ +import numpy as np +import os +import paddle.fluid as fluid +import logging +import args +import random +import time +from evaluator import BiRNN + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger("fluid") +logger.setLevel(logging.INFO) + +def train(args): + + model = BiRNN() + inputs = model.input_data(args.item_len) + loss, auc_val, batch_auc, auc_states = model.net(inputs, args.hidden_size, args.batch_size*args.sample_size, args.item_vocab, args.embd_dim) + + optimizer = fluid.optimizer.Adam(learning_rate=args.base_lr, epsilon=1e-4) + optimizer.minimize(loss) + + place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + # Build a random data set. + user_slot_names = [] + item_slot_names = [] + lens = [] + labels = [] + user_id = 0 + for i in range(args.sample_size): + user_slot_name = [] + for j in range(args.batch_size): + user_slot_name.append(user_id) + user_id += 1 + user_slot_names.append(user_slot_name) + + item_slot_name = np.random.randint(args.item_vocab, size=(args.batch_size, args.item_len)) + item_slot_names.append(item_slot_name) + lenght = np.array([args.item_len]*args.batch_size) + lens.append(lenght) + label = np.random.randint(2, size=(args.batch_size, args.item_len)) + labels.append(label) + + for epoch in range(args.epochs): + for i in range(args.sample_size): + begin = time.time() + loss_val, auc = exe.run(fluid.default_main_program(), + feed={ + "user_slot_names": np.array(user_slot_names[i]).reshape(args.batch_size, 1), + "item_slot_names": item_slot_names[i].astype('int64'), + "lens": lens[i].astype('int64'), + "labels": labels[i].astype('int64') + }, + return_numpy=True, + fetch_list=[loss.name, auc_val]) + end = time.time() + logger.info("epoch_id: {}, batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format( + epoch, end-begin, float(np.array(loss_val)), float(np.array(auc)))) + + #save model + 
model_dir = os.path.join(args.model_dir, 'epoch_' + str(epoch + 1), "checkpoint") + main_program = fluid.default_main_program() + fluid.save(main_program, model_dir) + +if __name__ == "__main__": + args = args.parse_args() + train(args) \ No newline at end of file diff --git a/PaddleRec/rerank/listwise/train_cpu.sh b/PaddleRec/rerank/listwise/train_cpu.sh new file mode 100644 index 00000000..c09c4751 --- /dev/null +++ b/PaddleRec/rerank/listwise/train_cpu.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 0 --epochs 20 --batch_size 32 --model_dir './model_dir' --embd_dim 16 --hidden_size 128 --item_vocab 200 --user_vocab 200 --item_len 5 --sample_size 100 --base_lr 0.01 diff --git a/PaddleRec/rerank/listwise/train_gpu.sh b/PaddleRec/rerank/listwise/train_gpu.sh new file mode 100644 index 00000000..2bee22a3 --- /dev/null +++ b/PaddleRec/rerank/listwise/train_gpu.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1 --epochs 20 --batch_size 32 --model_dir './model_dir' --embd_dim 16 --hidden_size 128 --item_vocab 200 --user_vocab 200 --item_len 5 --sample_size 100 --base_lr 0.01 diff --git a/PaddleRec/rerank/listwise/utils.py b/PaddleRec/rerank/listwise/utils.py new file mode 100644 index 00000000..c2833133 --- /dev/null +++ b/PaddleRec/rerank/listwise/utils.py @@ -0,0 +1,36 @@ +from paddle import fluid +import numpy as np + +def fluid_sequence_pad(input, pad_value, maxlen=None): + """ + args: + input: (batch*seq_len, dim) + returns: + (batch, max_seq_len, dim) + """ + pad_value = fluid.layers.cast(fluid.layers.assign(input=np.array([pad_value], 'float32')), input.dtype) + input_padded, _ = fluid.layers.sequence_pad(input, pad_value, maxlen=maxlen) # (batch, max_seq_len, 1), (batch, 1) + # TODO, maxlen=300, used to solve issues: https://github.com/PaddlePaddle/Paddle/issues/14164 + return input_padded + +def fluid_sequence_get_pos(lodtensor): + """ + args: + lodtensor: lod = [[0,4,7]] + return: + pos: lod = [[0,4,7]] + data = 
[0,1,2,3,0,1,2] + shape = [-1, 1] + """ + lodtensor = fluid.layers.reduce_sum(lodtensor, dim=1, keep_dim=True) + assert lodtensor.shape == (-1, 1), (lodtensor.shape()) + ones = fluid.layers.cast(lodtensor * 0 + 1, 'float32') # (batch*seq_len, 1) + ones_padded = fluid_sequence_pad(ones, 0) # (batch, max_seq_len, 1) + ones_padded = fluid.layers.squeeze(ones_padded, [2]) # (batch, max_seq_len) + seq_len = fluid.layers.cast(fluid.layers.reduce_sum(ones_padded, 1, keep_dim=True), 'int64') # (batch, 1) + seq_len = fluid.layers.squeeze(seq_len, [1]) + + pos = fluid.layers.cast(fluid.layers.cumsum(ones_padded, 1, exclusive=True), 'int64') + pos = fluid.layers.sequence_unpad(pos, seq_len) # (batch*seq_len, 1) + pos.stop_gradient = True + return pos \ No newline at end of file -- GitLab