Commit 8c0a08c5 authored by O2Dyokii

rs_fm

Parent 862bfc69
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Factorization Machines with tf\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 根据user/items的id,建立稀疏矩阵\n",
"参考:https://gist.github.com/babakx/7a3fc9739b7778f6673a458605e18963"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from itertools import count\n",
"from collections import defaultdict\n",
"from scipy.sparse import csr\n",
"import numpy as np\n",
"\n",
"def vectorize_dic(dic,ix=None,p=None,n=0,g=0):\n",
" # dic -- dictionary of feature lists. Keys are the name of features\n",
" # ix -- index generator (default None)\n",
" # p -- dimension of feature space (number of columns in the sparse matrix) (default None)\n",
" # n -- num sample\n",
" # g -- num group: eg: uese/items---> g=2\n",
" \n",
" if ix==None:\n",
" ix = dict()\n",
" \n",
" \n",
" nz = n * g # number of non-zores\n",
"\n",
" col_ix = np.empty(nz,dtype = int)\n",
"\n",
" i = 0\n",
" for k,lis in dic.items():\n",
" for t in range(len(lis)):\n",
" ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k),0) + 1\n",
" # 附加索引'l'以防止将具有相同id的不同列映射到同一个索引\n",
" col_ix[i+t*g] = ix[str(lis[t]) + str(k)]\n",
" i += 1\n",
"\n",
" row_ix = np.repeat(np.arange(0,n),g)\n",
" data = np.ones(nz)\n",
" if p == None:\n",
" p = len(ix)\n",
"\n",
" ixx = np.where(col_ix < p)\n",
" return csr.csr_matrix((data[ixx],(row_ix[ixx],col_ix[ixx])),shape=(n,p))"
]
},
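{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sanity check of `vectorize_dic` on a hand-made users/items dictionary (the toy ids below are made up for illustration, and the cell assumes the version above, which returns both the matrix and the index map): every row should contain exactly two ones, one in the user block and one in the item block."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy check with made-up ids: 3 interactions over 2 users and 2 items\n",
"toy, toy_ix = vectorize_dic({'users': [1, 2, 1], 'items': [10, 10, 20]}, n=3, g=2)\n",
"print(toy.todense())   # expected: a 3x4 one-hot matrix with two ones per row\n",
"print(dict(toy_ix))    # mapping from 'id+feature' keys to column indices"
]
},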
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loading data\n",
"使用MovieLens100k的数据,将数据转化成稀疏矩阵"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction import DictVectorizer\n",
"\n",
"cols = ['user','item','rating','timestamp']\n",
"train = pd.read_csv('data/ua.base',delimiter='\\t',names=cols)\n",
"test = pd.read_csv('data/ua.test', delimiter='\\t', names=cols)\n",
"\n",
"x_train = vectorize_dic({'users':train['user'].values, 'items':train['item'].values},n=len(train.index),g=2)\n",
"x_test= vectorize_dic({'users':test['user'].values,'items':test['item'].values},ix,x_train.shape[1],n=len(test.index),g=2)\n",
"\n",
"y_train = train.rating.values\n",
"y_test = test.rating.values"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Input To Dense\n",
"把输入的x_train和x_test转化成dense格式,使其能被tf使用。"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(90570, 2623) (9430, 2623)\n"
]
}
],
"source": [
"x_train = x_train.todense()\n",
"x_test = x_test.todense()\n",
"\n",
"print(x_train.shape, x_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 用tensorflow定义FM模型"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# 初始化参数\n",
"import tensorflow as tf\n",
"\n",
"n,p = x_train.shape\n",
"# number 0f latent factor\n",
"k = 10\n",
"\n",
"x = tf.placeholder('float',[None,p])\n",
"y = tf.placeholder('float',[None,1])\n",
"\n",
"# bias and weight\n",
"w0 = tf.Variable(tf.zeros([1]))\n",
"w = tf.Variable(tf.zeros([p]))\n",
"\n",
"#interaction factors\n",
"v = tf.Variable(tf.random_normal([k,p],mean=0,stddev=0.01))\n",
"\n",
"y_hat = tf.Variable(tf.zeros([n, 1]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 定义输出y的计算公式\n",
"$$ \\hat{y}(\\mathbf{x}) = w_0 + \\sum_{j=1}^{p}w_jx_j + \\frac{1}{2} \\sum_{f=1}^{k} ((\\sum_{j=1}^{p}v_{j,f}x_j)^2-\\sum_{j=1}^{p}v_{j,f}^2 x_j^2)$$"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From <ipython-input-19-174f8e1533a4>:2: calling reduce_sum (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"keep_dims is deprecated, use keepdims instead\n"
]
}
],
"source": [
"# 计算FM公式的输出\n",
"linear_terms = tf.add(w0,tf.reduce_sum(tf.multiply(w,x),1,keep_dims=True))\n",
"pair_interactions = 0.5 * tf.reduce_sum(\n",
" tf.subtract(\n",
" tf.pow(tf.matmul(x,tf.transpose(v)),2),\n",
" tf.matmul(tf.pow(x,2),tf.transpose(tf.pow(v,2)))),axis=1, keep_dims=True)\n",
"y_hat = tf.add(linear_terms, pair_interactions)"
]
},
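{
"cell_type": "markdown",
"metadata": {},
"source": [
"Not part of the original notebook: a quick NumPy check of the O(kp) reformulation used above against the naive pairwise double sum, on a single random sample (the names `x_demo`/`v_demo` are just for this illustration)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Verify 0.5 * sum_f ((sum_j v_fj x_j)^2 - sum_j v_fj^2 x_j^2) == sum_{j<j'} <v_j, v_j'> x_j x_j'\n",
"rng = np.random.RandomState(0)\n",
"x_demo = rng.rand(5)        # one sample with 5 features\n",
"v_demo = rng.rand(3, 5)     # k=3 latent factors, laid out [k, p] like `v` above\n",
"\n",
"fast = 0.5 * np.sum((v_demo @ x_demo) ** 2 - (v_demo ** 2) @ (x_demo ** 2))\n",
"naive = sum(v_demo[:, j] @ v_demo[:, jp] * x_demo[j] * x_demo[jp]\n",
"            for j in range(5) for jp in range(j + 1, 5))\n",
"print(np.allclose(fast, naive))  # should print True"
]
},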
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loss function\n",
"$$ L = \\sum_{i=1}^{n} (y_i - \\hat{y}_i)^2 + \\lambda_w ||W||^2 + \\lambda_v ||V||^2$$"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# L2 reg sum of squares of loss function\n",
"lambda_w = tf.constant(0.001, name='lambda_w')\n",
"lambda_v = tf.constant(0.001, name='lambda_v')\n",
"\n",
"l2_norm = tf.reduce_sum(\n",
" tf.add(\n",
" tf.multiply(lambda_w, tf.pow(w,2)),\n",
" tf.multiply(lambda_v, tf.pow(v,2))))\n",
"error = tf.reduce_mean(tf.square(tf.subtract(y,y_hat)))\n",
"loss = tf.add(error,l2_norm)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Optimization\n",
"用SGD进行优化: $\\Theta_{i+1} = \\Theta_{i} - \\eta \\frac{\\delta L}{\\delta \\Theta}$"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Mini-batcher"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"def batcher(X_,y_=None,batch_size=-1):\n",
" n_samples = X_.shape[0]\n",
" if batch_size == -1:\n",
" batch_size = n_samples\n",
" if batch_size < 1:\n",
" raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))\n",
" \n",
" for i in range(0,n_samples,batch_size):\n",
" upper_bound = min(i + batch_size,n_samples)\n",
" ret_x = X_[i:upper_bound]\n",
" ret_y = None\n",
" if y_ is not None:\n",
" ret_y = y_[i:i + batch_size]\n",
" yield (ret_x,ret_y)"
]
},
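{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small illustration (not in the original) of what `batcher` yields: slicing the first 2500 training rows into batches of 1000 should give two full batches and one remainder batch."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Peek at the batch shapes produced by batcher\n",
"for bX, bY in batcher(x_train[:2500], y_train[:2500], batch_size=1000):\n",
"    print(bX.shape, bY.shape)\n",
"# expected: (1000, p) (1000,), (1000, p) (1000,), (500, p) (500,)"
]
},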
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tensorflow graph and traing"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6ccb183e948f4aa88f20186a0a31ffe8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=10), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"from tqdm import tqdm_notebook as tqdm\n",
"\n",
"epochs = 10\n",
"batch_size = 1000\n",
"\n",
"# tf graph\n",
"init = tf.global_variables_initializer()\n",
"sess = tf.Session()\n",
"\n",
"sess.run(init)\n",
"\n",
"for epochs in tqdm(range(epochs),unit='epoch'):\n",
" perm = np.random.permutation(x_train.shape[0])\n",
" # iterate over batches\n",
" for bX,bY in batcher(x_train[perm],y_train[perm],batch_size):\n",
" sess.run(optimizer, feed_dict={x: bX.reshape(-1,p), y: bY.reshape(-1,1)})"
]
},
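{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional, not in the original notebook: the same training loop can also report the average mini-batch loss per epoch to check that optimization is converging; a minimal sketch that reuses the graph and session above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Variant of the loop above that also tracks the average mini-batch loss per epoch\n",
"for epoch in tqdm(range(epochs), unit='epoch'):\n",
"    perm = np.random.permutation(x_train.shape[0])\n",
"    losses = []\n",
"    for bX, bY in batcher(x_train[perm], y_train[perm], batch_size):\n",
"        _, batch_loss = sess.run([optimizer, loss],\n",
"                                 feed_dict={x: bX.reshape(-1, p), y: bY.reshape(-1, 1)})\n",
"        losses.append(batch_loss)\n",
"    print('epoch {}: mean batch loss {:.4f}'.format(epoch, np.mean(losses)))"
]
},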
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 评价模型\n",
"用RMSE评价"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1.1257708]\n"
]
}
],
"source": [
"errors = []\n",
"for bX,bY in batcher(x_test,y_test):\n",
" errors.append(sess.run(error,feed_dict={x: bX.reshape(-1,p), y: bY.reshape(-1,1)}))\n",
"\n",
"RMSE = np.sqrt(np.array(errors))\n",
"print(RMSE)"
]
},
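{
"cell_type": "markdown",
"metadata": {},
"source": [
"The loop above uses the default batch_size=-1, so `errors` holds a single mean squared error and its square root is already the RMSE. If the test set were evaluated in mini-batches instead, the per-batch MSEs would need to be weighted by batch size before taking the root; a minimal sketch (not in the original):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# RMSE aggregated over mini-batches, weighting each batch MSE by its size\n",
"sq_err_sum, n_obs = 0.0, 0\n",
"for bX, bY in batcher(x_test, y_test, batch_size=1000):\n",
"    mse = sess.run(error, feed_dict={x: bX.reshape(-1, p), y: bY.reshape(-1, 1)})\n",
"    sq_err_sum += mse * bX.shape[0]\n",
"    n_obs += bX.shape[0]\n",
"print(np.sqrt(sq_err_sum / n_obs))"
]
},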
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Note: https://kaiyuanyokii2n.com/FM.html#more
## Model demos for some recommendation algorithms
Notes: https://kaiyuanyokii2n.com/