In [1]:
from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [2]:
# Rating records ("train" split of the MovieLens 100k ratings dataset).
ratings = tfds.load(
    'movielens/100k-ratings',
    split="train",
)
# Features of every available movie ("train" split of the movies dataset).
movies = tfds.load(
    'movielens/100k-movies',
    split="train",
)

In [3]:
# Peek at the first rating record only. `dict_batch` stays bound after the
# loop, and later inspection cells rely on it.
for dict_batch in ratings.take(1):
    print(dict_batch)

{'bucketized_user_age': <tf.Tensor: shape=(), dtype=float32, numpy=45.0>, 'movie_genres': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([7])>, 'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'357'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b"One Flew Over the Cuckoo's Nest (1975)">, 'raw_user_age': <tf.Tensor: shape=(), dtype=float32, numpy=46.0>, 'timestamp': <tf.Tensor: shape=(), dtype=int64, numpy=879024327>, 'user_gender': <tf.Tensor: shape=(), dtype=bool, numpy=True>, 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'138'>, 'user_occupation_label': <tf.Tensor: shape=(), dtype=int64, numpy=4>, 'user_occupation_text': <tf.Tensor: shape=(), dtype=string, numpy=b'doctor'>, 'user_rating': <tf.Tensor: shape=(), dtype=float32, numpy=4.0>, 'user_zip_code': <tf.Tensor: shape=(), dtype=string, numpy=b'53211'>}


In [4]:
# List the feature names carried by the sampled rating record.
dict_batch.keys()

dict_keys(['bucketized_user_age', 'movie_genres', 'movie_id', 'movie_title', 'raw_user_age', 'timestamp', 'user_gender', 'user_id', 'user_occupation_label', 'user_occupation_text', 'user_rating', 'user_zip_code'])

In [5]:
# Show the raw `user_id` feature of the sampled rating record.
dict_batch['user_id']

<tf.Tensor: shape=(), dtype=string, numpy=b'138'>

In [6]:
# Keep only the two features the model consumes: the movie title and the
# user id for each rating, and the bare title for each movie.
# NOTE: both names are rebound in place, so this cell is not idempotent.
# Re-running it without reloading the data would apply the `movies` lambda
# to an element that is already the bare title rather than a feature dict.
ratings = ratings.map(
    lambda rating: {
        "movie_title": rating["movie_title"],
        "user_id": rating["user_id"],
    }
)
movies = movies.map(lambda movie: movie["movie_title"])

In [7]:
# String -> integer lookup layer for user ids; `mask_token=None` means no
# mask entry is reserved in the vocabulary.
user_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)

In [8]:
# Build the user-id vocabulary from every `user_id` in the ratings dataset.
# Original note: 944. Presumably the resulting vocabulary size (943 distinct
# ids plus one out-of-vocabulary slot); TODO confirm.
user_ids_vocabulary.adapt(ratings.map(lambda x: x["user_id"]))   # 944

In [9]:
# Dataset of user-id tensors, grouped into batches of up to 10,000 ratings.
user_ids = ratings.batch(10000).map(lambda x: x["user_id"])

In [10]:
# Materialise every batch of user ids as a Python list of tensors (inspection only).
list(user_ids)

[<tf.Tensor: shape=(10000,), dtype=string, numpy=array([b'138', b'92', b'301', ..., b'648', b'244', b'896'], dtype=object)>,
 <tf.Tensor: shape=(10000,), dtype=string, numpy=array([b'60', b'497', b'43', ..., b'221', b'394', b'843'], dtype=object)>,
 <tf.Tensor: shape=(10000,), dtype=string, numpy=array([b'798', b'299', b'174', ..., b'92', b'334', b'174'], dtype=object)>,
 <tf.Tensor: shape=(10000,), dtype=string, numpy=array([b'798', b'130', b'647', ..., b'231', b'188', b'82'], dtype=object)>,
 <tf.Tensor: shape=(10000,), dtype=string, numpy=array([b'871', b'244', b'660', ..., b'327', b'614', b'28'], dtype=object)>,
 <tf.Tensor: shape=(10000,), dtype=string, numpy=array([b'796', b'523', b'795', ..., b'83', b'398', b'712'], dtype=object)>,
 <tf.Tensor: shape=(10000,), dtype=string, numpy=array([b'807', b'234', b'72', ..., b'176', b'158', b'927'], dtype=object)>,
 <tf.Tensor: shape=(10000,), dtype=string, numpy=array([b'445', b'18', b'707', ..., b'249', b'48', b'308'], dtype=object)>,
 <

In [11]:
# Flatten the per-batch tensors into one array with an entry per rating.
np.concatenate(list(user_ids))

array([b'138', b'92', b'301', ..., b'262', b'911', b'276'], dtype=object)

In [12]:
# Sorted array of the distinct user ids found across all ratings.
# Original note: 943. Presumably the number of distinct ids; TODO confirm.
unique_user_ids = np.unique(np.concatenate(list(user_ids)))    # 943

In [13]:
# String -> integer lookup layer for movie titles; `mask_token=None` means no
# mask entry is reserved in the vocabulary.
movie_titles_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)

In [14]:
# Build the title vocabulary from the movies dataset (bare titles at this point).
# Original note: 1665. Presumably the resulting vocabulary size (1664 distinct
# titles plus one out-of-vocabulary slot); TODO confirm.
movie_titles_vocabulary.adapt(movies)  # 1665

In [15]:
# Dataset of movie-title tensors, grouped into batches of up to 1,000 titles.
movie_titles = movies.batch(1_000)

In [16]:
# Sorted array of the distinct movie titles in the catalogue.
# Original note: 1664. Presumably the number of distinct titles; TODO confirm.
unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))  # 1664

In [17]:
class MovieLensModel(tfrs.Model):
  """Retrieval model that pairs a user tower with a movie tower.

  Deriving from the `tfrs.Model` base class cuts down on boilerplate; under
  the hood this is still a plain Keras model.
  """

  def __init__(
      self,
      user_model: tf.keras.Model,
      movie_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    """Stores the two towers and the task used to score them.

    Args:
      user_model: Model applied to the `"user_id"` feature.
      movie_model: Model applied to the `"movie_title"` feature.
      task: Retrieval task called with the two towers' outputs to produce
        the loss.
    """
    super().__init__()

    # Representation models for the two sides of each rating.
    self.user_model = user_model
    self.movie_model = movie_model

    # Task object that produces the loss from the paired embeddings.
    self.task = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Computes the task loss for one batch of features.

    Args:
      features: Mapping that must contain the `"user_id"` and
        `"movie_title"` entries.
      training: Accepted as part of the signature; not used in this body.

    Returns:
      Whatever `self.task` returns for the user and movie embeddings.
    """
    query_vectors = self.user_model(features["user_id"])
    candidate_vectors = self.movie_model(features["movie_title"])

    return self.task(query_vectors, candidate_vectors)

In [18]:
# Seed TensorFlow's global RNG before any embedding table is created, so the
# random initial weights (and the outputs that depend on them) are
# reproducible on Restart & Run All.
tf.random.set_seed(42)

# Output width shared by the user and movie embedding tables.
EMBEDDING_DIM = 64

# User tower: raw user-id string -> vocabulary index -> embedding vector.
# NOTE(review): newer Keras releases appear to rename `vocab_size()` to
# `vocabulary_size()`; confirm against the installed TensorFlow version.
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(user_ids_vocabulary.vocab_size(), EMBEDDING_DIM)
])
# Movie tower: raw title string -> vocabulary index -> embedding vector.
movie_model = tf.keras.Sequential([
    movie_titles_vocabulary,
    tf.keras.layers.Embedding(movie_titles_vocabulary.vocab_size(), EMBEDDING_DIM)
])

# Retrieval objective. Its FactorizedTopK metric is given the whole movie
# catalogue, embedded by the movie tower in batches of 128.
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
    movies.batch(128).map(movie_model)
  )
)









In [19]:
# Map the sampled record's user-id string to its integer vocabulary index.
user_ids_vocabulary(dict_batch['user_id'])

<tf.Tensor: shape=(), dtype=int64, numpy=561>

In [20]:
# Raw user-id string that was looked up in the vocabulary, for comparison.
dict_batch['user_id']

<tf.Tensor: shape=(), dtype=string, numpy=b'138'>

In [21]:
# Map the sampled record's movie title to its integer vocabulary index.
movie_titles_vocabulary(dict_batch['movie_title'])

<tf.Tensor: shape=(), dtype=int64, numpy=576>

In [22]:
# Raw movie title that was looked up in the vocabulary, for comparison.
dict_batch['movie_title']

<tf.Tensor: shape=(), dtype=string, numpy=b"One Flew Over the Cuckoo's Nest (1975)">

In [23]:
# Run the sampled user id through the user tower. `model.fit` has not been
# called yet, so this is the embedding before training.
user_model(dict_batch['user_id'])

<tf.Tensor: shape=(64,), dtype=float32, numpy=
array([ 0.02092246,  0.01330004, -0.01104026,  0.01947654,  0.02912861,
       -0.00991607,  0.00816537,  0.00370462, -0.01238743,  0.01668283,
       -0.01573145, -0.02591802,  0.00480957, -0.03842024, -0.01969916,
        0.01367648, -0.03533678, -0.03356051,  0.04099574, -0.03840814,
       -0.02776179, -0.04254384, -0.02628747,  0.0423843 ,  0.02145881,
        0.01275916,  0.00325542, -0.02677317, -0.03149624,  0.0078699 ,
       -0.03592194,  0.04344089,  0.00764067, -0.00041021, -0.02955389,
       -0.01077508,  0.0296025 ,  0.03615678, -0.02300107,  0.04892621,
       -0.03009094, -0.03677251,  0.00770546, -0.04230518, -0.02819197,
       -0.03131267,  0.02267143,  0.01238422,  0.00255175, -0.0347008 ,
        0.04993348,  0.0061731 , -0.0475098 ,  0.01446799, -0.00303407,
       -0.02765246,  0.00710671, -0.04645333, -0.03955282,  0.04639938,
        0.00993375, -0.00522555, -0.0302899 ,  0.02171392], dtype=float32)>

In [24]:
# Run the sampled movie title through the movie tower. `model.fit` has not
# been called yet, so this is the embedding before training.
movie_model(dict_batch['movie_title'])

<tf.Tensor: shape=(64,), dtype=float32, numpy=
array([ 0.00913445,  0.04378858,  0.04076729, -0.04351962,  0.00422343,
        0.03657113,  0.01862463,  0.03506254, -0.01321954,  0.04385557,
       -0.00345308,  0.01523553, -0.04838085,  0.04184191, -0.04864149,
        0.00213599,  0.04431114, -0.00274039,  0.00660695, -0.01186788,
       -0.01478336,  0.03168315, -0.04875847, -0.00365946, -0.02098386,
       -0.02250336,  0.03528608,  0.02538711, -0.04762761, -0.04180253,
       -0.03502623, -0.0352935 , -0.02946589, -0.00899428, -0.02912351,
        0.00325345,  0.0016913 ,  0.01136813, -0.02269085, -0.03098171,
        0.03672192,  0.02978219, -0.03851994,  0.04898627,  0.04050212,
       -0.03917491,  0.01909404, -0.01818258, -0.0471717 , -0.00783293,
        0.04632254, -0.04985919,  0.00077146, -0.03853441, -0.02321854,
       -0.04326954, -0.01016202, -0.03939896, -0.02222459,  0.02341056,
       -0.04016956, -0.04010973, -0.01156746, -0.04070725], dtype=float32)>

In [25]:
# Combine the two towers and the retrieval task into a single trainable model.
model = MovieLensModel(user_model, movie_model, task)

In [26]:
# Train with Adagrad; 0.5 is passed as the first positional argument
# (the learning rate in Keras' Adagrad signature).
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

In [27]:
# Training input: ratings grouped into batches of up to 4,096 (no shuffling applied here).
ratings_ds = ratings.batch(4096)

In [28]:
# Pull the first training batch for inspection.
ratings_batch = next(iter(ratings_ds))

In [29]:
# Movie titles contained in the first training batch.
ratings_batch['movie_title']

<tf.Tensor: shape=(4096,), dtype=string, numpy=
array([b"One Flew Over the Cuckoo's Nest (1975)",
       b'Strictly Ballroom (1992)', b'Very Brady Sequel, A (1996)', ...,
       b'Godfather, The (1972)', b'Die Hard: With a Vengeance (1995)',
       b'Jerry Maguire (1996)'], dtype=object)>

In [30]:
# Train for three passes over the batched ratings.
model.fit(ratings_ds, epochs=3)

Epoch 1/3
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f4df013bd68>

In [31]:
# Brute-force top-k retrieval layer; the trained user tower is handed in so
# raw user ids can be passed as queries.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

In [32]:
# Populate the index: candidates are the movie-tower embeddings (computed in
# batches of 100) and the second argument supplies the raw titles.
# NOTE(review): later TFRS releases appear to change this method's
# signature; confirm against the installed version.
index.index(movies.batch(100).map(model.movie_model), movies)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f4ddc2d1668>

In [33]:
# Query the index for users "42" and "43". The first return value is
# discarded (presumably the match scores; TODO confirm) and `titles` holds
# one row of recommended titles per queried user.
_, titles = index(np.array(["42", "43"]))
# Only the first row (user "42") is printed, trimmed to its first three titles.
print(f"Top 3 recommendations for user 42: {titles[0, :3]}")

Top 3 recommendations for user 42: [b'Just Cause (1995)'
 b'Far From Home: The Adventures of Yellow Dog (1995)'
 b'Rent-a-Kid (1995)']


In [34]:
# Full recommendation matrix: one row per queried user ("42", then "43").
titles

<tf.Tensor: shape=(2, 10), dtype=string, numpy=
array([[b'Just Cause (1995)',
        b'Far From Home: The Adventures of Yellow Dog (1995)',
        b'Rent-a-Kid (1995)', b'Mirage (1995)',
        b'Winnie the Pooh and the Blustery Day (1968)',
        b'Aristocats, The (1970)', b'Indian in the Cupboard, The (1995)',
        b'Nell (1994)', b'Dolores Claiborne (1994)',
        b'Love in the Afternoon (1957)'],
       [b'Mr. Wrong (1996)', b'Affair to Remember, An (1957)',
        b"Fathers' Day (1997)", b'Big Bully (1996)',
        b'Bed of Roses (1996)', b'Don Juan DeMarco (1995)',
        b'Only You (1994)', b'Indian in the Cupboard, The (1995)',
        b'Out to Sea (1997)', b'Corrina, Corrina (1994)']], dtype=object)>