In [1]:
import pprint

import tensorflow_datasets as tfds

# Fetch the "train" split of the MovieLens 100K ratings dataset through TFDS.
ratings = tfds.load(name="movielens/100k-ratings", split="train")

# Pretty-print one example (as NumPy values) to see which features it carries.
for x in ratings.take(1).as_numpy_iterator():
    pprint.pprint(x)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


In [2]:
import numpy as np
import tensorflow as tf

# String -> integer id lookup layer for movie titles. It is created with the
# default settings and has no vocabulary until `adapt()` is called on it.
movie_title_lookup = tf.keras.layers.experimental.preprocessing.StringLookup()

In [3]:
# Learn the title vocabulary from the titles found in the ratings data.
# Original note on this line: "1666" (presumably the vocabulary size; confirm).
movie_title_lookup.adapt(
    ratings.map(lambda example: example["movie_title"])
)

# Show the first three entries of the learned vocabulary.
print(f"Vocabulary: {movie_title_lookup.get_vocabulary()[:3]}")

Vocabulary: ['', '[UNK]', 'Star Wars (1977)']


In [4]:
# A second title vocabulary: no mask token this time, and fitted on the
# titles of the movies table rather than on the ratings table.
movie_titles_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None
)
movies = tfds.load("movielens/100k-movies", split="train").map(
    lambda movie: movie["movie_title"]
)
movie_titles_vocabulary.adapt(movies)

In [5]:
# Display the vocabulary learned from the ratings titles. In the output the
# first two slots are the mask token '' and the out-of-vocabulary token '[UNK]'.
movie_title_lookup.get_vocabulary()

['',
 '[UNK]',
 'Star Wars (1977)',
 'Contact (1997)',
 'Fargo (1996)',
 'Return of the Jedi (1983)',
 'Liar Liar (1997)',
 'English Patient, The (1996)',
 'Scream (1996)',
 'Toy Story (1995)',
 'Air Force One (1997)',
 'Independence Day (ID4) (1996)',
 'Raiders of the Lost Ark (1981)',
 'Godfather, The (1972)',
 'Pulp Fiction (1994)',
 'Twelve Monkeys (1995)',
 'Silence of the Lambs, The (1991)',
 'Jerry Maguire (1996)',
 'Chasing Amy (1997)',
 'Rock, The (1996)',
 'Empire Strikes Back, The (1980)',
 'Star Trek: First Contact (1996)',
 'Titanic (1997)',
 'Back to the Future (1985)',
 'Mission: Impossible (1996)',
 'Fugitive, The (1993)',
 'Indiana Jones and the Last Crusade (1989)',
 'Willy Wonka and the Chocolate Factory (1971)',
 'Princess Bride, The (1987)',
 'Forrest Gump (1994)',
 'Saint, The (1997)',
 'Monty Python and the Holy Grail (1974)',
 'Full Monty, The (1997)',
 'Men in Black (1997)',
 'Terminator, The (1984)',
 'E.T. the Extra-Terrestrial (1982)',
 'Dead Man Walking (19

In [6]:
# Display the vocabulary learned from the movies table. This layer was built
# with mask_token=None, so the output starts directly at '[UNK]'.
movie_titles_vocabulary.get_vocabulary()

['[UNK]',
 "Ulee's Gold (1997)",
 'That Darn Cat! (1997)',
 'Substance of Fire, The (1996)',
 'Sliding Doors (1998)',
 'Nightwatch (1997)',
 'Money Talks (1997)',
 'Kull the Conqueror (1997)',
 'Ice Storm, The (1997)',
 'Hurricane Streets (1998)',
 'Hugo Pool (1997)',
 'Fly Away Home (1996)',
 'Desperate Measures (1998)',
 'Designated Mourner, The (1997)',
 'Deceiver (1997)',
 'Chasing Amy (1997)',
 'Chairman of the Board (1998)',
 'Butcher Boy, The (1998)',
 'Body Snatchers (1993)',
 'Á köldum klaka (Cold Fever) (1994)',
 'unknown',
 'Zeus and Roxanne (1997)',
 "Young Poisoner's Handbook, The (1995)",
 'Young Guns II (1990)',
 'Young Guns (1988)',
 'Young Frankenstein (1974)',
 'You So Crazy (1994)',
 'Year of the Horse (1997)',
 'Yankee Zulu (1994)',
 'Wyatt Earp (1994)',
 'Wrong Trousers, The (1993)',
 'World of Apu, The (Apur Sansar) (1959)',
 "Wooden Man's Bride, The (Wu Kui) (1994)",
 'Wonderland (1997)',
 'Wonderful, Horrible Life of Leni Riefenstahl, The (1993)',
 'Women, The (

In [7]:
# Map two raw titles to their integer ids in the adapted vocabulary.
movie_title_lookup([
    "Star Wars (1977)",
    "One Flew Over the Cuckoo's Nest (1975)",
])



In [8]:
# Hashing assigns each title to a bin without needing a vocabulary. The bin
# count is deliberately large so that two titles rarely share a bin.
num_hashing_bins = 200_000
movie_title_hashing = tf.keras.layers.experimental.preprocessing.Hashing(num_bins=num_hashing_bins)

In [9]:
# Hash the same two titles into bin indices.
movie_title_hashing([
    "Star Wars (1977)",
    "One Flew Over the Cuckoo's Nest (1975)",
])



In [10]:
# Embedding table sized from the explicit vocabulary lookup (not from the
# hashing layer): one 32-dimensional row per vocabulary entry.
movie_title_embedding = tf.keras.layers.Embedding(
    input_dim=movie_title_lookup.vocab_size(), output_dim=32
)





In [11]:
# Chain the two layers: raw title string -> vocabulary id -> embedding vector.
movie_title_model = tf.keras.Sequential(
    layers=[movie_title_lookup, movie_title_embedding]
)

In [12]:
# Run a single raw title through the lookup + embedding pipeline; the
# embedding layer was configured with output_dim=32.
movie_title_model(["Star Wars (1977)"])

Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.




In [13]:
# Vocabulary of user ids, learned from the ratings data.
user_id_lookup = tf.keras.layers.experimental.preprocessing.StringLookup()
user_id_lookup.adapt(
    ratings.map(lambda example: example["user_id"])
)

# One 32-dimensional embedding row per entry in the user id vocabulary.
user_id_embedding = tf.keras.layers.Embedding(
    input_dim=user_id_lookup.vocab_size(), output_dim=32
)

# Raw user id string -> vocabulary id -> embedding vector.
user_id_model = tf.keras.Sequential([user_id_lookup, user_id_embedding])





In [14]:
# Print the raw `timestamp` feature of the first three ratings. The values are
# large integers (presumably Unix-epoch seconds; confirm against the dataset
# documentation).
for x in ratings.take(3).as_numpy_iterator():
 print(f"Timestamp: {x['timestamp']}.")

Timestamp: 879024327.
Timestamp: 875654590.
Timestamp: 882075110.


In [15]:
# Fit a normalization layer on all rating timestamps, fed in batches of 1024.
timestamp_normalization = tf.keras.layers.experimental.preprocessing.Normalization()
timestamp_normalization.adapt(
    ratings.map(lambda example: example["timestamp"]).batch(1024)
)

# Apply the fitted layer to the first three timestamps.
for x in ratings.take(3).as_numpy_iterator():
    print(f"Normalized timestamp: {timestamp_normalization(x['timestamp'])}.")

Normalized timestamp: [[-0.8429372]].
Normalized timestamp: [[-1.4735202]].
Normalized timestamp: [[-0.27203265]].


In [16]:
# Largest timestamp in the ratings data: a running tf.maximum over the
# dataset, seeded with an int64 zero.
max_timestamp = (
    ratings
    .map(lambda example: example["timestamp"])
    .reduce(tf.cast(0, tf.int64), tf.maximum)
    .numpy()
    .max()
)

In [17]:
# Find the smallest and largest timestamp in the ratings data.
#
# Each reduction is seeded with the opposite end of the int64 range, so the
# answer never depends on a guessed bound. The earlier seeds (0 for the
# maximum, 1e9 for the minimum) were only correct while every timestamp lay
# between them; a dataset whose timestamps all exceed 1e9 would silently
# report 1e9 as its minimum. For data inside the old bounds the results are
# unchanged.
max_timestamp = ratings.map(lambda x: x["timestamp"]).reduce(
    np.int64(np.iinfo(np.int64).min), tf.maximum).numpy().max()
min_timestamp = ratings.map(lambda x: x["timestamp"]).reduce(
    np.int64(np.iinfo(np.int64).max), tf.minimum).numpy().min()

In [18]:
# 1000 evenly spaced boundaries covering the observed timestamp range,
# both end points included.
timestamp_buckets = np.linspace(start=min_timestamp, stop=max_timestamp, num=1000)

print(f"Buckets: {timestamp_buckets[:3]}")

Buckets: [8.74724710e+08 8.74743291e+08 8.74761871e+08]


In [19]:
# Bucketize raw timestamps at the precomputed boundaries, then embed the
# bucket index. N boundaries delimit N + 1 buckets, hence the "+ 1" rows.
timestamp_embedding_model = tf.keras.Sequential([
    tf.keras.layers.experimental.preprocessing.Discretization(timestamp_buckets.tolist()),
    tf.keras.layers.Embedding(input_dim=len(timestamp_buckets) + 1, output_dim=32),
])

# Embed the timestamp of the first rating, fed as a batch of one.
for timestamp in (
    ratings.take(1)
    .map(lambda example: example["timestamp"])
    .batch(1)
    .as_numpy_iterator()
):
    print(f"Timestamp embedding: {timestamp_embedding_model(timestamp)}.")

Timestamp embedding: [[-0.04667547 -0.02808862 0.01682058 0.0338834 -0.03198426 0.00383388
 0.03077259 0.01907918 0.04480798 -0.03309586 -0.01344311 0.0165614
 -0.02186432 -0.01408075 0.00864227 -0.03585001 -0.00817011 0.01023756
 0.02895612 -0.040274 0.0173769 0.0120603 -0.01638366 -0.04449072
 -0.01819245 -0.03192252 -0.00627976 0.01466325 0.04082705 0.03110823
 -0.02607706 0.02907329]].


In [20]:
# Tokenizing text layer for movie titles; its token vocabulary is learned
# from the titles in the ratings data.
title_text = tf.keras.layers.experimental.preprocessing.TextVectorization()
title_text.adapt(
    ratings.map(lambda example: example["movie_title"])
)

In [21]:
# Vectorize the title of the first rating (as a batch of one) into token ids.
for row in (
    ratings.batch(1)
    .map(lambda example: example["movie_title"])
    .take(1)
):
    print(title_text(row))

tf.Tensor([[ 32 266 162 2 267 265 53]], shape=(1, 7), dtype=int64)


In [22]:
# Look at a handful of learned tokens (positions 40-44). The output shows
# lowercase words and release years as separate tokens.
title_text.get_vocabulary()[40:45]

['first', '1998', '1977', '1971', 'monty']

In [23]:
# Number of entries in the adapted token vocabulary, as returned by
# get_vocabulary().
len(title_text.get_vocabulary())

2468

In [24]:
# Run one single-example batch through each user-side feature model, then
# show the three outputs joined along axis 1.
for row in ratings.batch(1).take(1):
    print(user_id_model(row["user_id"]))
    print(timestamp_embedding_model(row["timestamp"]))
    print(timestamp_normalization(row["timestamp"]))
    print(
        tf.concat(
            [
                user_id_model(row["user_id"]),
                timestamp_embedding_model(row["timestamp"]),
                timestamp_normalization(row["timestamp"]),
            ],
            axis=1,
        )
    )

tf.Tensor(
[[-0.0438923 0.00482889 0.01170977 -0.04974284 -0.02898848 0.03447617
 -0.00012434 0.01063495 -0.02815315 -0.0414476 0.03511366 -0.00849334
 -0.02312708 0.02719413 0.03063178 0.04632486 0.04097709 0.02941669
 0.04679706 -0.01792303 -0.03674388 0.0379673 0.00889906 -0.00635362
 -0.03862506 0.04349775 -0.04566013 0.03360397 0.02138766 0.02380446
 -0.02359644 0.02273068]], shape=(1, 32), dtype=float32)
tf.Tensor(
[[-0.04667547 -0.02808862 0.01682058 0.0338834 -0.03198426 0.00383388
 0.03077259 0.01907918 0.04480798 -0.03309586 -0.01344311 0.0165614
 -0.02186432 -0.01408075 0.00864227 -0.03585001 -0.00817011 0.01023756
 0.02895612 -0.040274 0.0173769 0.0120603 -0.01638366 -0.04449072
 -0.01819245 -0.03192252 -0.00627976 0.01466325 0.04082705 0.03110823
 -0.02607706 0.02907329]], shape=(1, 32), dtype=float32)
tf.Tensor([[-0.8429372]], shape=(1, 1), dtype=float32)
tf.Tensor(
[[-4.3892302e-02 4.8288926e-03 1.1709772e-02 -4.9742844e-02
 -2.8988481e-02 3.4476172e-02 -1.2433529e-04 1.

In [25]:
class UserModel(tf.keras.Model):
    """Builds one feature vector per example from `user_id` and `timestamp`.

    The output concatenates, along axis 1:
      * a 32-dimensional embedding of the user id,
      * a 32-dimensional embedding of the bucketized timestamp,
      * the normalized timestamp.

    `normalized_timestamp` is created here without statistics; the notebook
    calls `adapt()` on it after constructing the model.
    """

    def __init__(self):
        super().__init__()

        # Reuses the module-level `user_id_lookup` layer and pairs it with a
        # fresh embedding table sized from that lookup's vocabulary.
        self.user_embedding = tf.keras.Sequential([
            user_id_lookup,
            tf.keras.layers.Embedding(user_id_lookup.vocab_size(), 32),
        ])
        # N boundaries split the number line into N + 1 buckets (ids 0..N),
        # so N + 1 rows are exactly enough. This was "+ 2", which allocated a
        # row no bucket id can reach and disagreed with the standalone
        # `timestamp_embedding_model` built from the same boundaries.
        self.timestamp_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.Discretization(timestamp_buckets.tolist()),
            tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
        ])
        self.normalized_timestamp = tf.keras.layers.experimental.preprocessing.Normalization()

    def call(self, inputs):
        """Return the concatenated user features for a batch.

        Args:
            inputs: mapping that provides at least the "user_id" and
                "timestamp" entries of a batch of examples.

        Returns:
            A tensor joining the three feature blocks along axis 1.
        """
        # Take the input dictionary, pass it through each input layer,
        # and concatenate the result.
        return tf.concat([
            self.user_embedding(inputs["user_id"]),
            self.timestamp_embedding(inputs["timestamp"]),
            self.normalized_timestamp(inputs["timestamp"]),
        ], axis=1)

In [26]:
user_model = UserModel()

# The model's normalization layer starts without statistics; fit it on all
# rating timestamps, fed in batches of 128.
user_model.normalized_timestamp.adapt(
    ratings.map(lambda example: example["timestamp"]).batch(128)
)

# Show the first three components of the representation of one example.
for row in ratings.batch(1).take(1):
    print(f"Computed representations: {user_model(row)[0, :3]}")





Computed representations: [-0.01393082 -0.048677 0.00579016]


In [27]:
class MovieModel(tf.keras.Model):
    """Represents a movie title in two ways and concatenates the results.

    * `title_embedding` treats the whole title as one vocabulary entry and
      looks up a 32-dimensional embedding for it.
    * `title_text_embedding` splits the title into tokens, embeds each token
      in 32 dimensions, and averages the token embeddings.

    The text vectorization layer is created here without a vocabulary; the
    notebook adapts it after constructing the model.
    """

    def __init__(self):
        super().__init__()

        max_tokens = 10_000

        # Whole-title path: reuses the module-level `movie_title_lookup`.
        whole_title_table = tf.keras.layers.Embedding(
            input_dim=movie_title_lookup.vocab_size(), output_dim=32
        )
        self.title_embedding = tf.keras.Sequential([
            movie_title_lookup,
            whole_title_table,
        ])

        # Token path: at most `max_tokens` distinct tokens, with id 0 masked.
        tokenizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=max_tokens
        )
        token_table = tf.keras.layers.Embedding(
            input_dim=max_tokens, output_dim=32, mask_zero=True
        )
        # Averaging the per-word embeddings yields one vector per title.
        token_average = tf.keras.layers.GlobalAveragePooling1D()
        self.title_text_embedding = tf.keras.Sequential([
            tokenizer,
            token_table,
            token_average,
        ])

    def call(self, inputs):
        """Return both title representations joined along axis 1.

        Args:
            inputs: mapping that provides at least the "movie_title" entry of
                a batch of examples.
        """
        whole_title_vec = self.title_embedding(inputs["movie_title"])
        token_average_vec = self.title_text_embedding(inputs["movie_title"])
        return tf.concat([whole_title_vec, token_average_vec], axis=1)

In [28]:
movie_model = MovieModel()

# The first layer of the token pipeline is the text vectorizer; fit its
# vocabulary on the titles in the ratings data.
movie_model.title_text_embedding.layers[0].adapt(
    ratings.map(lambda example: example["movie_title"])
)

# Show the first three components of the representation of one example.
for row in ratings.batch(1).take(1):
    print(f"Computed representations: {movie_model(row)[0, :3]}")





Computed representations: [-0.04467254 0.00962601 -0.03479909]
