In [1]:
from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [2]:
# Load the "train" split of both MovieLens 100K datasets in one pass:
# the rating events first, then the features of all available movies.
ratings, movies = (
    tfds.load(dataset_name, split="train")
    for dataset_name in ("movielens/100k-ratings", "movielens/100k-movies")
)

In [3]:
# Print the first rating example only; `dict_batch` stays bound afterwards
# and is reused by later inspection cells.
for dict_batch in ratings.take(1):
    print(dict_batch)

{'bucketized_user_age': , 'movie_genres': , 'movie_id': , 'movie_title': , 'raw_user_age': , 'timestamp': , 'user_gender': , 'user_id': , 'user_occupation_label': , 'user_occupation_text': , 'user_rating': , 'user_zip_code': }


In [4]:
# Show which feature names the sampled rating example carries.
dict_batch.keys()

dict_keys(['bucketized_user_age', 'movie_genres', 'movie_id', 'movie_title', 'raw_user_age', 'timestamp', 'user_gender', 'user_id', 'user_occupation_label', 'user_occupation_text', 'user_rating', 'user_zip_code'])

In [5]:
# Inspect the raw `user_id` feature of the sampled rating example.
dict_batch['user_id']



In [6]:
# Select the basic features.
ratings = ratings.map(lambda x: {
 "movie_title": x["movie_title"],
 "user_id": x["user_id"]
})
movies = movies.map(lambda x: x["movie_title"])

In [7]:
# String-to-index lookup layer for user ids, created without a mask token.
user_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None
)

In [8]:
# Fit the lookup layer on the user id of every rating
# (the original note records a resulting vocabulary of 944 entries).
user_id_stream = ratings.map(lambda rating: rating["user_id"])
user_ids_vocabulary.adapt(user_id_stream)

In [9]:
# Re-batch the ratings in chunks of 10,000 and keep only each chunk's user ids.
user_ids = ratings.batch(10_000).map(lambda rating_batch: rating_batch["user_id"])

In [10]:
# Materialise the batched user-id dataset to look at its individual batches.
list(user_ids)

[,
 ,
 ,
 ,
 ,
 ,
 ,
 ,
 ,
 ]

In [11]:
# Join all batches into a single NumPy array of raw user-id byte strings.
np.concatenate(list(user_ids))

array([b'138', b'92', b'301', ..., b'262', b'911', b'276'], dtype=object)

In [12]:
# Flatten the batched ids into one array, then deduplicate
# (the original note records 943 distinct users).
all_user_ids = np.concatenate(list(user_ids))
unique_user_ids = np.unique(all_user_ids)

In [13]:
# String-to-index lookup layer for movie titles, created without a mask token.
movie_titles_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None
)

In [14]:
# Fit the title lookup layer on the full stream of movie titles.
movie_titles_vocabulary.adapt(movies) # 1665

In [15]:
# Group the titles into batches of 1,000 so they can be gathered into arrays.
movie_titles = movies.batch(1_000)

In [16]:
# Flatten the batched titles into one array, then deduplicate
# (the original note records 1664 distinct titles).
all_movie_titles = np.concatenate(list(movie_titles))
unique_movie_titles = np.unique(all_movie_titles)

In [17]:
class MovieLensModel(tfrs.Model):
    """Retrieval model that pairs a user model with a movie model.

    We derive from a custom base class to help reduce boilerplate. Under the
    hood, these are still plain Keras Models.
    """

    def __init__(
        self,
        user_model: tf.keras.Model,
        movie_model: tf.keras.Model,
        task: tfrs.tasks.Retrieval,
    ):
        """Store the two sub-models and the retrieval task.

        Args:
            user_model: Model applied to ``features["user_id"]``.
            movie_model: Model applied to ``features["movie_title"]``.
            task: Retrieval task called with both embeddings to produce the loss.
        """
        super().__init__()

        # Set up user and movie representations.
        self.user_model = user_model
        self.movie_model = movie_model

        # Set up a retrieval task.
        self.task = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Define how the loss is computed for one batch of features.

        Args:
            features: Mapping that must contain the keys "user_id" and
                "movie_title".
            training: Accepted as part of the method signature; this
                implementation does not use it.

        Returns:
            Whatever the retrieval task returns for the two embeddings.
        """
        user_embeddings = self.user_model(features["user_id"])
        movie_embeddings = self.movie_model(features["movie_title"])

        return self.task(user_embeddings, movie_embeddings)

In [18]:
# Define user and movie models.
user_model = tf.keras.Sequential([
 user_ids_vocabulary,
 tf.keras.layers.Embedding(user_ids_vocabulary.vocab_size(), 64)
])
movie_model = tf.keras.Sequential([
 movie_titles_vocabulary,
 tf.keras.layers.Embedding(movie_titles_vocabulary.vocab_size(), 64)
])

# Define your objectives.
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
 movies.batch(128).map(movie_model)
 )
)









In [19]:
# Look up the integer index the vocabulary assigns to the sampled user id.
user_ids_vocabulary(dict_batch['user_id'])



In [20]:
# Raw user id of the sampled example, shown next to its looked-up index.
dict_batch['user_id']



In [21]:
# Look up the integer index the vocabulary assigns to the sampled movie title.
movie_titles_vocabulary(dict_batch['movie_title'])



In [22]:
# Raw movie title of the sampled example, shown next to its looked-up index.
dict_batch['movie_title']



In [23]:
# Run the sampled user id through the user model (lookup, then embedding).
user_model(dict_batch['user_id'])



In [24]:
# Run the sampled movie title through the movie model (lookup, then embedding).
movie_model(dict_batch['movie_title'])



In [25]:
# Assemble the retrieval model from the two sub-models and the task.
model = MovieLensModel(
    user_model=user_model,
    movie_model=movie_model,
    task=task,
)

In [26]:
# Train with Adagrad at a learning rate of 0.5.
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.5)
model.compile(optimizer=optimizer)

In [27]:
# Batch the ratings, 4096 examples at a time, for training.
ratings_ds = ratings.batch(4096)

In [28]:
# Pull the first training batch so its contents can be inspected.
ratings_batch = next(iter(ratings_ds))

In [29]:
# Inspect the movie titles held in `ratings_batch`.
ratings_batch['movie_title']



In [30]:
# Train for three epochs over the batched ratings.
model.fit(ratings_ds, epochs=3)

Epoch 1/3
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Epoch 2/3
Epoch 3/3




In [31]:
# Create a brute-force top-K layer wrapping the trained user model.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

In [32]:
# Register the candidates: movie embeddings (computed in batches of 100) as the
# vectors and the raw movie titles as their identifiers.
# NOTE(review): newer tensorflow_recommenders releases appear to expose
# `index_from_dataset` for dataset inputs - confirm against the installed version.
index.index(movies.batch(100).map(model.movie_model), movies)



In [33]:
# Query the index for users "42" and "43"; only the returned titles are kept.
query_user_ids = np.array(["42", "43"])
_, titles = index(query_user_ids)

# Row 0 corresponds to the first queried id ("42"); show its first three titles.
top_three_for_first_user = titles[0, :3]
print(f"Top 3 recommendations for user 42: {top_three_for_first_user}")

Top 3 recommendations for user 42: [b'Just Cause (1995)'
 b'Far From Home: The Adventures of Yellow Dog (1995)'
 b'Rent-a-Kid (1995)']


In [34]:
# Display the complete titles result returned by the index for both queried ids.
titles

