Commit 8c0a08c5 authored by O2Dyokii

rs_fm

Parent 862bfc69
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Factorization Machines with tf\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 根据user/items的id,建立稀疏矩阵\n",
"参考:https://gist.github.com/babakx/7a3fc9739b7778f6673a458605e18963"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from itertools import count\n",
"from collections import defaultdict\n",
"from scipy.sparse import csr\n",
"import numpy as np\n",
"\n",
"def vectorize_dic(dic,ix=None,p=None,n=0,g=0):\n",
" # dic -- dictionary of feature lists. Keys are the name of features\n",
" # ix -- index generator (default None)\n",
" # p -- dimension of feature space (number of columns in the sparse matrix) (default None)\n",
" # n -- num sample\n",
" # g -- num group: eg: uese/items---> g=2\n",
" \n",
" if ix==None:\n",
" ix = dict()\n",
" \n",
" \n",
" nz = n * g # number of non-zores\n",
"\n",
" col_ix = np.empty(nz,dtype = int)\n",
"\n",
" i = 0\n",
" for k,lis in dic.items():\n",
" for t in range(len(lis)):\n",
" ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k),0) + 1\n",
" # 附加索引'l'以防止将具有相同id的不同列映射到同一个索引\n",
" col_ix[i+t*g] = ix[str(lis[t]) + str(k)]\n",
" i += 1\n",
"\n",
" row_ix = np.repeat(np.arange(0,n),g)\n",
" data = np.ones(nz)\n",
" if p == None:\n",
" p = len(ix)\n",
"\n",
" ixx = np.where(col_ix < p)\n",
" return csr.csr_matrix((data[ixx],(row_ix[ixx],col_ix[ixx])),shape=(n,p))"
]
},
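{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sanity check of `vectorize_dic` on a hand-made users/items dictionary (the toy ids below are made up for illustration, and the cell assumes the version above, which returns both the matrix and the index map): every row should contain exactly two ones, one in the user block and one in the item block."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy check with made-up ids: 3 interactions over 2 users and 2 items\n",
"toy, toy_ix = vectorize_dic({'users': [1, 2, 1], 'items': [10, 10, 20]}, n=3, g=2)\n",
"print(toy.todense())   # expected: a 3x4 one-hot matrix with two ones per row\n",
"print(dict(toy_ix))    # mapping from 'id+feature' keys to column indices"
]
},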
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loading data\n",
"使用MovieLens100k的数据,将数据转化成稀疏矩阵"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction import DictVectorizer\n",
"\n",
"cols = ['user','item','rating','timestamp']\n",
"train = pd.read_csv('data/ua.base',delimiter='\\t',names=cols)\n",
"test = pd.read_csv('data/ua.test', delimiter='\\t', names=cols)\n",
"\n",
"x_train = vectorize_dic({'users':train['user'].values, 'items':train['item'].values},n=len(train.index),g=2)\n",
"x_test= vectorize_dic({'users':test['user'].values,'items':test['item'].values},ix,x_train.shape[1],n=len(test.index),g=2)\n",
"\n",
"y_train = train.rating.values\n",
"y_test = test.rating.values"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Input To Dense\n",
"把输入的x_train和x_test转化成dense格式,使其能被tf使用。"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(90570, 2623) (9430, 2623)\n"
]
}
],
"source": [
"x_train = x_train.todense()\n",
"x_test = x_test.todense()\n",
"\n",
"print(x_train.shape, x_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 用tensorflow定义FM模型"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# 初始化参数\n",
"import tensorflow as tf\n",
"\n",
"n,p = x_train.shape\n",
"# number 0f latent factor\n",
"k = 10\n",
"\n",
"x = tf.placeholder('float',[None,p])\n",
"y = tf.placeholder('float',[None,1])\n",
"\n",
"# bias and weight\n",
"w0 = tf.Variable(tf.zeros([1]))\n",
"w = tf.Variable(tf.zeros([p]))\n",
"\n",
"#interaction factors\n",
"v = tf.Variable(tf.random_normal([k,p],mean=0,stddev=0.01))\n",
"\n",
"y_hat = tf.Variable(tf.zeros([n, 1]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 定义输出y的计算公式\n",
"$$ \\hat{y}(\\mathbf{x}) = w_0 + \\sum_{j=1}^{p}w_jx_j + \\frac{1}{2} \\sum_{f=1}^{k} ((\\sum_{j=1}^{p}v_{j,f}x_j)^2-\\sum_{j=1}^{p}v_{j,f}^2 x_j^2)$$"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From <ipython-input-19-174f8e1533a4>:2: calling reduce_sum (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"keep_dims is deprecated, use keepdims instead\n"
]
}
],
"source": [
"# 计算FM公式的输出\n",
"linear_terms = tf.add(w0,tf.reduce_sum(tf.multiply(w,x),1,keep_dims=True))\n",
"pair_interactions = 0.5 * tf.reduce_sum(\n",
" tf.subtract(\n",
" tf.pow(tf.matmul(x,tf.transpose(v)),2),\n",
" tf.matmul(tf.pow(x,2),tf.transpose(tf.pow(v,2)))),axis=1, keep_dims=True)\n",
"y_hat = tf.add(linear_terms, pair_interactions)"
]
},
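{
"cell_type": "markdown",
"metadata": {},
"source": [
"Not part of the original notebook: a quick NumPy check of the O(kp) reformulation used above against the naive pairwise double sum, on a single random sample (the names `x_demo`/`v_demo` are just for this illustration)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Verify 0.5 * sum_f ((sum_j v_fj x_j)^2 - sum_j v_fj^2 x_j^2) == sum_{j<j'} <v_j, v_j'> x_j x_j'\n",
"rng = np.random.RandomState(0)\n",
"x_demo = rng.rand(5)        # one sample with 5 features\n",
"v_demo = rng.rand(3, 5)     # k=3 latent factors, laid out [k, p] like `v` above\n",
"\n",
"fast = 0.5 * np.sum((v_demo @ x_demo) ** 2 - (v_demo ** 2) @ (x_demo ** 2))\n",
"naive = sum(v_demo[:, j] @ v_demo[:, jp] * x_demo[j] * x_demo[jp]\n",
"            for j in range(5) for jp in range(j + 1, 5))\n",
"print(np.allclose(fast, naive))  # should print True"
]
},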
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loss function\n",
"$$ L = \\sum_{i=1}^{n} (y_i - \\hat{y}_i)^2 + \\lambda_w ||W||^2 + \\lambda_v ||V||^2$$"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# L2 reg sum of squares of loss function\n",
"lambda_w = tf.constant(0.001, name='lambda_w')\n",
"lambda_v = tf.constant(0.001, name='lambda_v')\n",
"\n",
"l2_norm = tf.reduce_sum(\n",
" tf.add(\n",
" tf.multiply(lambda_w, tf.pow(w,2)),\n",
" tf.multiply(lambda_v, tf.pow(v,2))))\n",
"error = tf.reduce_mean(tf.square(tf.subtract(y,y_hat)))\n",
"loss = tf.add(error,l2_norm)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Optimization\n",
"用SGD进行优化: $\\Theta_{i+1} = \\Theta_{i} - \\eta \\frac{\\delta L}{\\delta \\Theta}$"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Mini-batcher"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"def batcher(X_,y_=None,batch_size=-1):\n",
" n_samples = X_.shape[0]\n",
" if batch_size == -1:\n",
" batch_size = n_samples\n",
" if batch_size < 1:\n",
" raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))\n",
" \n",
" for i in range(0,n_samples,batch_size):\n",
" upper_bound = min(i + batch_size,n_samples)\n",
" ret_x = X_[i:upper_bound]\n",
" ret_y = None\n",
" if y_ is not None:\n",
" ret_y = y_[i:i + batch_size]\n",
" yield (ret_x,ret_y)"
]
},
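{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small illustration (not in the original) of what `batcher` yields: slicing the first 2500 training rows into batches of 1000 should give two full batches and one remainder batch."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Peek at the batch shapes produced by batcher\n",
"for bX, bY in batcher(x_train[:2500], y_train[:2500], batch_size=1000):\n",
"    print(bX.shape, bY.shape)\n",
"# expected: (1000, p) (1000,), (1000, p) (1000,), (500, p) (500,)"
]
},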
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tensorflow graph and traing"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6ccb183e948f4aa88f20186a0a31ffe8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=10), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"from tqdm import tqdm_notebook as tqdm\n",
"\n",
"epochs = 10\n",
"batch_size = 1000\n",
"\n",
"# tf graph\n",
"init = tf.global_variables_initializer()\n",
"sess = tf.Session()\n",
"\n",
"sess.run(init)\n",
"\n",
"for epochs in tqdm(range(epochs),unit='epoch'):\n",
" perm = np.random.permutation(x_train.shape[0])\n",
" # iterate over batches\n",
" for bX,bY in batcher(x_train[perm],y_train[perm],batch_size):\n",
" sess.run(optimizer, feed_dict={x: bX.reshape(-1,p), y: bY.reshape(-1,1)})"
]
},
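{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional, not in the original notebook: the same training loop can also report the average mini-batch loss per epoch to check that optimization is converging; a minimal sketch that reuses the graph and session above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Variant of the loop above that also tracks the average mini-batch loss per epoch\n",
"for epoch in tqdm(range(epochs), unit='epoch'):\n",
"    perm = np.random.permutation(x_train.shape[0])\n",
"    losses = []\n",
"    for bX, bY in batcher(x_train[perm], y_train[perm], batch_size):\n",
"        _, batch_loss = sess.run([optimizer, loss],\n",
"                                 feed_dict={x: bX.reshape(-1, p), y: bY.reshape(-1, 1)})\n",
"        losses.append(batch_loss)\n",
"    print('epoch {}: mean batch loss {:.4f}'.format(epoch, np.mean(losses)))"
]
},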
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 评价模型\n",
"用RMSE评价"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1.1257708]\n"
]
}
],
"source": [
"errors = []\n",
"for bX,bY in batcher(x_test,y_test):\n",
" errors.append(sess.run(error,feed_dict={x: bX.reshape(-1,p), y: bY.reshape(-1,1)}))\n",
"\n",
"RMSE = np.sqrt(np.array(errors))\n",
"print(RMSE)"
]
},
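{
"cell_type": "markdown",
"metadata": {},
"source": [
"The loop above uses the default batch_size=-1, so `errors` holds a single mean squared error and its square root is already the RMSE. If the test set were evaluated in mini-batches instead, the per-batch MSEs would need to be weighted by batch size before taking the root; a minimal sketch (not in the original):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# RMSE aggregated over mini-batches, weighting each batch MSE by its size\n",
"sq_err_sum, n_obs = 0.0, 0\n",
"for bX, bY in batcher(x_test, y_test, batch_size=1000):\n",
"    mse = sess.run(error, feed_dict={x: bX.reshape(-1, p), y: bY.reshape(-1, 1)})\n",
"    sq_err_sum += mse * bX.shape[0]\n",
"    n_obs += bX.shape[0]\n",
"print(np.sqrt(sq_err_sum / n_obs))"
]
},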
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Note: https://kaiyuanyokii2n.com/FM.html#more
## Model demos for some recommendation algorithms
Notes: https://kaiyuanyokii2n.com/