Commit 21158b40 authored by Hongkun Yu, committed by A. Unique TensorFlower

No public description

PiperOrigin-RevId: 558890434
Parent 8bbb4841
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for backbones_3d."""
import tensorflow as tf
from official.projects.videoglue.configs import backbones_3d
class Backbones3DTest(tf.test.TestCase):
def test_vit_3d(self):
config = backbones_3d.Backbone3D(
type='vit_3d',
vit_3d=backbones_3d.VisionTransformer3D())
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for dataset."""
import tensorflow as tf
from official.projects.videoglue.configs import dataset
from official.vision.configs import common as common_cfg
class DatasetTest(tf.test.TestCase):
def test_dataset_valid(self):
config = dataset.DataConfig(
name='dummy_set',
data_augmentation=dataset.DataAugmentation(
type='ava', ava=dataset.AVA(scale_min=0.1, scale_max=1.0)),
feature_shape=(1, 2, 3, 4),
autoaug=common_cfg.AutoAugment(),
randaug=common_cfg.RandAugment(),
mixup_cutmix=common_cfg.MixupAndCutmix())
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for head."""
import tensorflow as tf
from official.projects.videoglue.configs import head as head_cfg
class HeadTest(tf.test.TestCase):
def test_mlp_head_valid(self):
config = head_cfg.MLP(
num_hidden_channels=128,
num_hidden_layers=4,
num_output_channels=1280,
use_sync_bn=True,
norm_momentum=0.99,
norm_epsilon=1e-5,
activation='relu')
config.validate()
def test_action_transformer_head_valid(self):
config = head_cfg.ActionTransformer(
activation='relu',
tx_activation='relu')
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for spatiotemporal_action_localization."""
import tensorflow as tf
from official.projects.videoglue.configs import spatiotemporal_action_localization as stal
class SpatiotemporalActionLocalizationTest(tf.test.TestCase):
def test_spatiotemporal_action_localization_config(self):
config = (
stal.spatiotemporal_action_localization())
self.assertIsInstance(
config.task,
stal.SpatiotemporalActionLocalizationTask)
self.assertIsInstance(
config.task.model,
stal.VideoActionTransformerModel)
def test_spatiotemporal_action_localization_vit12_config(self):
config = (
stal.spatiotemporal_action_localization_vit12())
self.assertIsInstance(
config.task,
stal.SpatiotemporalActionLocalizationTask)
self.assertEqual(
config.trainer.optimizer_config.optimizer.type, 'vit_adamw')
def test_spatiotemporal_action_localization_vit16_config(self):
config = (
stal.spatiotemporal_action_localization_vit16())
self.assertIsInstance(
config.task,
stal.SpatiotemporalActionLocalizationTask)
self.assertEqual(
config.trainer.optimizer_config.optimizer.type, 'vit_adamw')
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for video_classification."""
import tensorflow as tf
from official.projects.videoglue.configs import video_classification as cfg
class VideoCoarseClassificationTest(tf.test.TestCase):
def test_video_classification_config(self):
config = cfg.mh_video_classification()
self.assertIsInstance(config.task, cfg.MultiHeadVideoClassificationTask)
self.assertIsInstance(config.task.model,
cfg.MultiHeadVideoClassificationModel)
config.task.train_data.is_training = None
with self.assertRaises(KeyError):
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for action_localization data loader."""
import io
from absl import logging
import numpy as np
from PIL import Image
import tensorflow as tf
from official.projects.videoglue.datasets import action_localization
IMAGE_KEY = 'image/encoded'
KEYFRAME_INDEX = 'clip/key_frame/frame_index'
KEYFRAME_BOX_PREFIX = 'clip/key_frame/bbox'
DETECTED_BOX_PREFIX = 'centernet/bbox'
TFR_PATH = '/tmp/example.tfrecord'
def create_fake_tfse_sstable():
"""Creates fake data."""
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
with io.BytesIO() as buffer:
random_image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
num_frames = 4
tfse = tf.train.SequenceExample()
# keyframe index
tfse.context.feature.get_or_create(KEYFRAME_INDEX).int64_list.value[:] = [2]
# keyframe boxes
tfse.context.feature.get_or_create(
f'{KEYFRAME_BOX_PREFIX}/ymin').float_list.value[:] = [0.0, 0.1, 0.2, 0.2]
tfse.context.feature.get_or_create(
f'{KEYFRAME_BOX_PREFIX}/xmin').float_list.value[:] = [0.0, 0.1, 0.2, 0.2]
tfse.context.feature.get_or_create(
f'{KEYFRAME_BOX_PREFIX}/ymax').float_list.value[:] = [0.5, 0.6, 0.7, 0.7]
tfse.context.feature.get_or_create(
f'{KEYFRAME_BOX_PREFIX}/xmax').float_list.value[:] = [0.5, 0.6, 0.7, 0.7]
tfse.context.feature.get_or_create(
f'{KEYFRAME_BOX_PREFIX}/score').float_list.value[:] = [1.0, 1.0, 1.0, 1.0]
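# Note: the third and fourth keyframe boxes share identical coordinates on
# purpose, so that `merge_multi_labels` has duplicates to merge downstream.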
# boxes labels
tfse.context.feature.get_or_create(
f'{KEYFRAME_BOX_PREFIX}/label/index').int64_list.value[:] = [
0, 1, 2, 3
]
tfse.context.feature.get_or_create(
f'{KEYFRAME_BOX_PREFIX}/label/string').bytes_list.value[:] = [
b'0', b'1', b'2', b'3',
]
for i in range(num_frames):
# image
tfse.feature_lists.feature_list.get_or_create(
IMAGE_KEY).feature.add().bytes_list.value[:] = [raw_image_bytes]
# detected boxes.
tfse.feature_lists.feature_list.get_or_create(
f'{DETECTED_BOX_PREFIX}/ymin').feature.add().float_list.value[:] = [
0.0, 0.1, 0.2
]
tfse.feature_lists.feature_list.get_or_create(
f'{DETECTED_BOX_PREFIX}/xmin').feature.add().float_list.value[:] = [
0.0, 0.1, 0.2
]
tfse.feature_lists.feature_list.get_or_create(
f'{DETECTED_BOX_PREFIX}/ymax').feature.add().float_list.value[:] = [
0.5, 0.6, 0.7
]
tfse.feature_lists.feature_list.get_or_create(
f'{DETECTED_BOX_PREFIX}/xmax').feature.add().float_list.value[:] = [
0.5, 0.6, 0.7
]
tfse.feature_lists.feature_list.get_or_create(
f'{DETECTED_BOX_PREFIX}/score').feature.add().float_list.value[:] = [
0.91, 0.91, 0.1 * i
]
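# The third detected box's score varies with the frame index (0.1 * i), so
# any score-based filtering sees a different value per frame; the first two
# boxes keep a constant high score of 0.91.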
writer = tf.io.TFRecordWriter(TFR_PATH)
writer.write(tfse.SerializeToString())
logging.info('Writing tfrecord table: %s', TFR_PATH)
writer.close()
class ActionLocalizationTest(tf.test.TestCase):
def test_create_action_localization_dataset(self):
create_fake_tfse_sstable()
dataset_cls = action_localization.ActionLocalizationBaseFactory(
subset='train')
dataset_cls._NUM_CLASSES = 5
dataset_cls._ZERO_BASED_INDEX = True
configs = {
'is_training': False,
'num_frames': 4,
'temporal_stride': 1,
'num_instance_per_frame': 5,
'one_hot_label': True,
'merge_multi_labels': True,
'import_detected_bboxes': True,
'augmentation_type': 'ava',
'augmentation_params': {'scale_min': 0.0, 'scale_max': 0.0}
}
dataset_cls.configure(**configs)
ds = dataset_cls.make_dataset(shuffle=False, batch_size=1)
ds_iter = iter(ds)
data = next(ds_iter)
expected_subset = [
'image',
'keyframe_index',
'label',
'instances_position',
'instances_mask',
'instances_score',
'nonmerge_label',
'nonmerge_instances_position',
'detected_instances_position',
'detected_instances_mask',
'detected_instances_score',
]
self.assertSameElements(expected_subset, data.keys())
self.assertAllEqual(data['keyframe_index'], [[2]])
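# Keyframe boxes 2 and 3 share coordinates and are merged into a single
# multi-label instance (classes 2 and 3 both set), leaving three valid
# instances padded to num_instance_per_frame=5; rows 3-4 are all-zero padding.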
expected_label = tf.constant(
[[1., 0., 0., 0., 0.],
[0., 1., 0., 0., 0.],
[0., 0., 1., 1., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]])
expected_label = expected_label[None, ...]
self.assertAllEqual(data['label'], expected_label)
expected_instances_mask = tf.constant([True, True, True, False, False])
expected_instances_mask = expected_instances_mask[None, :]
self.assertAllEqual(data['instances_mask'], expected_instances_mask)
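# The non-merged view keeps all four original labels, padded with -1.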
expected_nonmerge_label = tf.constant([0, 1, 2, 3, -1])
expected_nonmerge_label = expected_nonmerge_label[None, :]
self.assertAllEqual(data['nonmerge_label'], expected_nonmerge_label)
self.assertAllEqual(data['detected_instances_position'].shape, [1, 5, 4])
self.assertAllEqual(data['detected_instances_mask'].shape, [1, 5])
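# At keyframe index 2 the third detected box has score 0.1 * 2 = 0.2, which
# presumably falls below the loader's score threshold, so only the two
# 0.91-score boxes are marked valid.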
expected_detected_instances_mask = tf.constant(
[[True, True, False, False, False]])
self.assertAllEqual(data['detected_instances_mask'],
expected_detected_instances_mask)
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for video_classification."""
import io
from absl import logging
import numpy as np
from PIL import Image
import tensorflow as tf
from official.projects.videoglue.datasets import video_classification
from official.vision.configs import common as common_cfg
IMAGE_KEY = 'image/encoded'
LABEL_KEY = 'clip/label/index'
TFR_PATH = '/tmp/sample.tfrecord'
def create_fake_tfse_sstable():
"""Creates fake data."""
num_frames = 25
tfse = tf.train.SequenceExample()
tfse.context.feature.get_or_create(LABEL_KEY).int64_list.value[:] = [0]
for frame_id in range(num_frames):
image = np.ones((263, 320, 3), dtype=np.uint8) * frame_id
image = Image.fromarray(image)
with io.BytesIO() as buffer:
image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
tfse.feature_lists.feature_list.get_or_create(
IMAGE_KEY).feature.add().bytes_list.value[:] = [raw_image_bytes]
writer = tf.io.TFRecordWriter(TFR_PATH)
writer.write(tfse.SerializeToString())
logging.info('Writing tfrecord table: %s', TFR_PATH)
writer.close()
class VideoClassificationTest(tf.test.TestCase):
def test_create_video_classification_data(self):
create_fake_tfse_sstable()
dataset_cls = video_classification.VideoClassificationBaseFactory(
subset='train')
configs = {
'is_training': True,
'num_frames': 4,
'one_hot_label': True,
}
dataset_cls.configure(**configs)
ds = dataset_cls.make_dataset(shuffle=False, batch_size=2)
ds_iter = iter(ds)
data = next(ds_iter)
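# The one-hot label has 400 classes, presumably the factory default
# (Kinetics-400 style), since `configs` does not override num_classes.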
self.assertAllEqual(data['label'].shape, [2, 400])
self.assertAllEqual(data['image'].shape, [2, 4, 224, 224, 3])
def test_video_classification_randaug(self):
create_fake_tfse_sstable()
dataset_cls = video_classification.VideoClassificationBaseFactory(
subset='train')
configs = {
'is_training': True,
'num_frames': 4,
'one_hot_label': True,
'randaug_params': common_cfg.RandAugment().as_dict(),
}
dataset_cls.configure(**configs)
ds = dataset_cls.make_dataset(shuffle=False, batch_size=2)
ds_iter = iter(ds)
data = next(ds_iter)
self.assertAllEqual(data['label'].shape, [2, 400])
self.assertAllEqual(data['image'].shape, [2, 4, 224, 224, 3])
def test_video_classification_autoaug(self):
create_fake_tfse_sstable()
dataset_cls = video_classification.VideoClassificationBaseFactory(
subset='train')
configs = {
'is_training': True,
'num_frames': 4,
'one_hot_label': True,
'autoaug_params': common_cfg.AutoAugment().as_dict(),
}
dataset_cls.configure(**configs)
ds = dataset_cls.make_dataset(shuffle=False, batch_size=2)
ds_iter = iter(ds)
data = next(ds_iter)
self.assertAllEqual(data['label'].shape, [2, 400])
self.assertAllEqual(data['image'].shape, [2, 4, 224, 224, 3])
def test_video_classification_mixup_cutmix(self):
create_fake_tfse_sstable()
dataset_cls = video_classification.VideoClassificationBaseFactory(
subset='train')
configs = {
'is_training': True,
'num_frames': 4,
'one_hot_label': True,
'mixup_cutmix_params': common_cfg.MixupAndCutmix().as_dict(),
}
dataset_cls.configure(**configs)
ds = dataset_cls.make_dataset(shuffle=False, batch_size=2)
ds_iter = iter(ds)
data = next(ds_iter)
self.assertAllEqual(data['label'].shape, [2, 400])
self.assertAllEqual(data['image'].shape, [2, 4, 224, 224, 3])
def test_video_classification_sample_segments(self):
create_fake_tfse_sstable()
dataset_cls = video_classification.VideoClassificationBaseFactory(
subset='train')
configs = {
'is_training': False,
'num_frames': 5,
'temporal_stride': 1,
'sample_from_segments': True,
'one_hot_label': True,
'mixup_cutmix_params': common_cfg.MixupAndCutmix().as_dict(),
}
dataset_cls.configure(**configs)
ds = dataset_cls.make_dataset(shuffle=False, batch_size=1)
ds_iter = iter(ds)
data = next(ds_iter)
self.assertAllEqual(data['label'].shape, [1, 400])
self.assertAllEqual(data['image'].shape, [1, 5, 224, 224, 3])
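# Each fake frame is a constant image equal to its frame index, and JPEG
# round-trips a constant image almost exactly. Sampling 5 frames from 25 by
# segments picks roughly frame indices [2, 7, 12, 16, 21]; images are scaled
# to [0, 1], so multiplying by 255 recovers those indices.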
average_image = tf.reduce_mean(data['image'] * 255., axis=[2, 3, 4])
self.assertAllEqual(average_image[0].numpy(), [2.0, 7.0, 12.0, 16.0, 21.0])
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for spatiotemporal_action_localization_evaluator."""
import tensorflow as tf
from official.projects.videoglue.evaluation import spatiotemporal_action_localization_evaluator as eval_util
class SpatiotemporalActionLocalizationEvaluatorTest(tf.test.TestCase):
def _create_test_data_simple(self):
boxes = tf.convert_to_tensor(
[[[0.1, 0.15, 0.2, 0.25], [0.35, 0.18, 0.43, 0.4],
[0.2, 0.1, 0.3, 0.2], [0.65, 0.55, 0.75, 0.85]],
[[0.2, 0.5, 0.5, 0.8], [0.7, 0.1, 0.9, 0.9],
[0.1, 0.4, 0.5, 0.7], [0.04, 0.05, 0.88, 0.77]]], dtype=tf.float32)
nonmerge_boxes = boxes
classes = tf.convert_to_tensor([[0, 2, 3, 4], [11, 12, 13, 14]],
dtype=tf.int32)
predictions = tf.one_hot(classes, depth=80)
data = {
'instances_position': boxes,
'nonmerge_instances_position': nonmerge_boxes,
'predictions': predictions,
'nonmerge_label': classes,
}
return data
def _create_test_data_complex(self):
nonmerge_boxes = tf.convert_to_tensor(
[[[0.1, 0.15, 0.2, 0.25], [0.1, 0.15, 0.2, 0.25],
[0.2, 0.1, 0.3, 0.2], [0.65, 0.55, 0.75, 0.85]],
[[0.2, 0.5, 0.5, 0.8], [0.7, 0.1, 0.9, 0.9],
[0.2, 0.5, 0.5, 0.8], [0.7, 0.1, 0.9, 0.9]]], dtype=tf.float32)
boxes = tf.convert_to_tensor(
[[[0.1, 0.15, 0.2, 0.25], [0.2, 0.1, 0.3, 0.2],
[0.65, 0.55, 0.75, 0.85], [-1, -1, -1, -1]],
[[0.2, 0.5, 0.5, 0.8], [0.7, 0.1, 0.9, 0.9],
[-1, -1, -1, -1], [-1, -1, -1, -1]]], dtype=tf.float32)
classes = tf.convert_to_tensor([[0, 2, 3, 4], [11, 12, 13, 14]],
dtype=tf.int32)
predictions = tf.one_hot(classes, depth=80)
data = {
'instances_position': boxes,
'nonmerge_instances_position': nonmerge_boxes,
'predictions': predictions,
'nonmerge_label': classes,
}
return data
def test_action_localization_eval_simple(self):
data = self._create_test_data_simple()
evaluator = eval_util.SpatiotemporalActionLocalizationEvaluator()
evaluator.reset_states()
evaluator.update_state(data)
metrics = evaluator.result()
self.assertAlmostEqual(metrics['mAP@.5IOU'], 1.0)
def test_action_localization_eval_complex(self):
data = self._create_test_data_complex()
evaluator = eval_util.SpatiotemporalActionLocalizationEvaluator()
evaluator.reset_states()
evaluator.update_state(data)
metrics = evaluator.result()
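# With duplicate boxes merged away and padded (-1) rows present, predictions
# no longer match the non-merged groundtruth one-to-one, so the mAP drops
# below 1.0; 0.64375 is the regression value for this fixture.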
self.assertAlmostEqual(metrics['mAP@.5IOU'], 0.64375)
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for vit_3d."""
from absl.testing import parameterized
import tensorflow as tf
from official.projects.videoglue.modeling.backbones import vit_3d
class Vit3DTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(8, 224, 87718656),
(16, 256, 88204032),
)
def test_network_creation(self, num_frames, input_size, params_count):
"""Test creation of VisionTransformer family models."""
tf.keras.backend.set_image_data_format('channels_last')
input_specs = tf.keras.layers.InputSpec(
shape=[2, num_frames, input_size, input_size, 3])
network = vit_3d.VisionTransformer3D(input_specs=input_specs)
inputs = tf.keras.Input(
shape=(num_frames, input_size, input_size, 3), batch_size=1)
_ = network(inputs)
self.assertEqual(network.count_params(), params_count)
def test_network_none_pooler(self):
"""Tests creation of VisionTransformer family models."""
tf.keras.backend.set_image_data_format('channels_last')
num_frames = 8
input_size = 224
input_specs = tf.keras.layers.InputSpec(
shape=[2, num_frames, input_size, input_size, 3])
network = vit_3d.VisionTransformer3D(
input_specs=input_specs,
pooler='none',
representation_size=128)
inputs = tf.keras.Input(
shape=(num_frames, input_size, input_size, 3), batch_size=1)
endpoints = network(inputs)
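# The 'none' pooler returns the token grid. Assuming the default 16x16
# spatial patches and a temporal patch size of 4, 224 / 16 = 14 and
# 8 / 4 = 2, with tokens projected to representation_size=128.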
self.assertEqual(endpoints['encoded_tokens'].shape, [1, 2, 14, 14, 128])
@parameterized.parameters('native', 'mae')
def test_network_convention(self, variant):
"""Tests creation of VisionTransformer family models."""
tf.keras.backend.set_image_data_format('channels_last')
num_frames = 8
input_size = 224
input_specs = tf.keras.layers.InputSpec(
shape=[2, num_frames, input_size, input_size, 3])
network = vit_3d.VisionTransformer3D(
variant=variant,
input_specs=input_specs,
pooler='none',
representation_size=128)
inputs = tf.keras.Input(
shape=(num_frames, input_size, input_size, 3), batch_size=1)
endpoints = network(inputs)
self.assertEqual(endpoints['encoded_tokens'].shape, [1, 2, 14, 14, 128])
def test_network_pos_embed_interpolation_mae(self):
"""Tests creation of VisionTransformer family models."""
tf.keras.backend.set_image_data_format('channels_last')
variant = 'mae'
pos_embed_shape = (8, 14, 14)
num_frames = 8
input_size = 256
input_specs = tf.keras.layers.InputSpec(
shape=[2, num_frames, input_size, input_size, 3])
network = vit_3d.VisionTransformer3D(
variant=variant,
input_specs=input_specs,
pooler='none',
representation_size=128,
pos_embed_shape=pos_embed_shape)
inputs = tf.keras.Input(
shape=(num_frames, input_size, input_size, 3), batch_size=1)
endpoints = network(inputs)
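# The 256px input yields a 16x16 patch grid (256 / 16), so the stored 14x14
# positional-embedding grid must be interpolated up to 16x16.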
self.assertEqual(endpoints['encoded_tokens'].shape, [1, 2, 16, 16, 128])
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for action_transformer."""
import tensorflow as tf
from official.projects.videoglue.modeling.heads import action_transformer
class ActionTransformerTest(tf.test.TestCase):
def test_action_transformer_head_construction(self):
head = action_transformer.ActionTransformerHead(
num_hidden_layers=1,
num_hidden_channels=1024,
use_sync_bn=False,
num_classes=80,
# parameters for TxDecoder
num_tx_channels=128,
num_tx_layers=3,
num_tx_heads=3,
use_positional_embedding=True)
inputs = {
'features': tf.ones([2, 4, 16, 16, 128]),
'instances_position': tf.random.uniform([2, 6, 4]),
}
outputs = head(inputs, training=False)
self.assertAllEqual(outputs.shape, [2, 6, 80])
def test_action_transformer_linear_head_construction(self):
head = action_transformer.ActionTransformerHead(
num_hidden_layers=0,
num_hidden_channels=1024,
use_sync_bn=False,
num_classes=80,
dropout_rate=0.5,
# parameters for TxDecoder
num_tx_channels=128,
num_tx_layers=0,
num_tx_heads=3,
attention_dropout_rate=0.2,
use_positional_embedding=False)
inputs = {
'features': tf.ones([2, 4, 16, 16, 128]),
'instances_position': tf.random.uniform([2, 6, 4]),
}
outputs = head(inputs, training=False)
self.assertAllEqual(outputs.shape, [2, 6, 80])
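# With num_hidden_layers=0 and num_tx_layers=0 the head reduces to a single
# dense projection, so only one kernel/bias pair is expected below.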
trainable_weight_names = [w.name for w in head.weights]
expected_weight_names = [
'action_transformer_head/mlp/dense/kernel:0',
'action_transformer_head/mlp/dense/bias:0'
]
self.assertCountEqual(trainable_weight_names, expected_weight_names)
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for simple."""
import numpy as np
import tensorflow as tf
from official.projects.videoglue.modeling.heads import simple
class SimpleTest(tf.test.TestCase):
def test_mlp_construction(self):
mlp_head = simple.MLP(
num_hidden_layers=3,
num_hidden_channels=128,
num_output_channels=56,
use_sync_bn=False,
activation='relu')
inputs = tf.zeros([2, 512])
outputs = mlp_head(inputs, training=False)
num_params = np.sum(
[np.prod(v.get_shape()) for v in mlp_head.trainable_weights])
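# The count is consistent with bias-free hidden dense layers plus batch norm
# and a final biased projection (an assumption about the MLP internals):
#   512*128 + 2*(128*128) = 98304  dense kernels
#   3 * (2*128)           = 768    batch-norm gamma/beta
#   128*56 + 56           = 7224   output layer
#   total                 = 106296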
self.assertEqual(num_params, 106296)
self.assertAllEqual(outputs.shape, [2, 56])
def test_att_pooler_classifier_construction(self):
pooler_head = simple.AttentionPoolerClassificationHead(
num_heads=6,
hidden_size=768,
num_classes=3)
inputs = tf.zeros([2, 16, 768])
inputs = tf.reshape(inputs, (2, 4, 4, 768))
outputs = pooler_head(inputs, training=False)
self.assertAllEqual(outputs.shape, [2, 3])
def test_att_pooler_classifier_construction_with_posembed(self):
pooler_head = simple.AttentionPoolerClassificationHead(
num_heads=6,
hidden_size=768,
num_classes=3,
add_temporal_pos_embed=True)
inputs = tf.zeros([2, 16, 768])
inputs = tf.reshape(inputs, (2, 4, 4, 768))
outputs = pooler_head(inputs, training=False)
self.assertAllEqual(outputs.shape, [2, 3])
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for TransformerDecoder."""
import tensorflow as tf
from official.projects.videoglue.modeling.heads import transformer_decoder
class TransformerTest(tf.test.TestCase):
def test_decoder_unit_return_shape(self):
decoder_unit = transformer_decoder.DecoderUnit(
num_channels=128,
use_bias=True,
dropout_rate=0.5,
activation='relu',
layer_norm_epsilon=1e-7)
batch_size = 16
num_inputs = 128
num_channels = 256
input_tensor = tf.zeros([batch_size, num_inputs, num_channels])
memory_tensor = tf.ones([batch_size, num_inputs * 4, num_channels])
outputs = decoder_unit(input_tensor, memory_tensor, training=False)
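# num_channels == num_inputs == 128 in this test, so the last dimension of
# `hidden_states` (the projection width) coincides with num_inputs.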
self.assertAllEqual(outputs['hidden_states'].shape,
[batch_size, num_inputs, num_inputs])
self.assertAllEqual(outputs['attention_weights'].shape,
[batch_size, num_inputs, 4 * num_inputs])
def test_decoder_layer_return_shape(self):
decoder_layer = transformer_decoder.TransformerDecoderLayer(
num_channels=128,
num_heads=3,
use_bias=True,
dropout_rate=0.5,
activation='relu',
layer_norm_epsilon=1e-7)
batch_size = 16
num_inputs = 128
num_channels = 256
input_tensor = tf.zeros([batch_size, num_inputs, num_channels])
memory_tensor = tf.ones([batch_size, num_inputs * 4, num_channels])
outputs = decoder_layer(input_tensor, memory_tensor, training=False)
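# hidden_states width is num_inputs * 3 here only because num_channels
# equals num_inputs (128); the layer appears to concatenate its 3 heads'
# 128-channel outputs.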
self.assertAllEqual(outputs['hidden_states'].shape,
[batch_size, num_inputs, num_inputs * 3])
self.assertAllEqual(outputs['attention_weights'][-1].shape,
[batch_size, num_inputs, 4 * num_inputs])
def test_decoder_return_shape(self):
decoder = transformer_decoder.TransformerDecoder(
num_channels=128,
num_layers=5,
num_heads=3,
use_bias=True,
dropout_rate=0.5,
activation='relu',
layer_norm_epsilon=1e-7)
batch_size = 16
num_inputs = 128
num_channels = 256
input_tensor = tf.zeros([batch_size, num_inputs, num_channels])
memory_tensor = tf.ones([batch_size, num_inputs * 4, num_channels])
outputs = decoder(input_tensor, memory_tensor, training=False)
self.assertLen(outputs['attention_weights'], 5)
self.assertAllEqual(outputs['hidden_states'][-1].shape,
[batch_size, num_inputs, num_inputs * 3])
self.assertAllEqual(outputs['attention_weights'][-1][-1].shape,
[batch_size, num_inputs, 4 * num_inputs])
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for video_action_transformer_model."""
import tensorflow as tf
from official.projects.videoglue.configs import spatiotemporal_action_localization as cfg
from official.projects.videoglue.modeling import video_action_transformer_model
class VideoActionTransformerModelTest(tf.test.TestCase):
def test_video_action_transformer_model_construction(self):
model_config = cfg.VideoActionTransformerModel()
input_specs = {
'image': tf.keras.layers.InputSpec(shape=[None, 4, 20, 20, 3]),
'instances_position': tf.keras.layers.InputSpec(shape=[None, 8, 4])
}
model = video_action_transformer_model.build_video_action_transformer_model(
input_specs_dict=input_specs,
model_config=model_config,
num_classes=80)
self.assertIsInstance(
model, video_action_transformer_model.VideoActionTransformerModel)
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for video classification network."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.projects.videoglue.modeling import video_classification_model
from official.vision.modeling import backbones
class MultiHeadsVideoClassificationNetworkTest(
parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(50, 8, 112, 'relu', False),
(50, 8, 112, 'swish', True),
)
def test_resnet3d_network_creation(self, model_id, temporal_size,
spatial_size, activation,
aggregate_endpoints):
"""Test for creation of a ResNet3D-50 classifier."""
input_specs = tf.keras.layers.InputSpec(
shape=[None, temporal_size, spatial_size, spatial_size, 3])
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
tf.keras.backend.set_image_data_format('channels_last')
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
input_specs=input_specs,
activation=activation)
num_classes = 1000
model = video_classification_model.MultiHeadVideoClassificationModel(
backbone=backbone,
num_classes=num_classes,
input_specs={'image': input_specs},
dropout_rate=0.2,
aggregate_endpoints=aggregate_endpoints,
)
inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3)
logits = model(inputs)
self.assertAllEqual([2, num_classes], logits.numpy().shape)
@parameterized.parameters(
(50, 8, 112, 'relu', False),
(50, 8, 112, 'swish', False),
)
def test_resnet3d_network_pooler_head_creation(
self, model_id, temporal_size, spatial_size, activation,
aggregate_endpoints):
"""Test for creation of a ResNet3D-50 classifier."""
input_specs = tf.keras.layers.InputSpec(
shape=[None, temporal_size, spatial_size, spatial_size, 3])
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
tf.keras.backend.set_image_data_format('channels_last')
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
input_specs=input_specs,
activation=activation)
num_classes = 1000
model = video_classification_model.MultiHeadVideoClassificationModel(
backbone=backbone,
num_classes=num_classes,
input_specs={'image': input_specs},
dropout_rate=0.2,
aggregate_endpoints=aggregate_endpoints,
classifier_type='pooler')
inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3)
logits = model(inputs)
self.assertAllEqual([2, num_classes], logits.numpy().shape)
@parameterized.parameters(
(50, 8, 112, 'relu', False),
(50, 8, 112, 'swish', True),
)
def test_resnet3d_network_multiheads_creation(
self, model_id, temporal_size, spatial_size, activation,
aggregate_endpoints):
"""Test for creation of a ResNet3D-50 multiheads classifier."""
input_specs = tf.keras.layers.InputSpec(
shape=[None, temporal_size, spatial_size, spatial_size, 3])
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
tf.keras.backend.set_image_data_format('channels_last')
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
input_specs=input_specs,
activation=activation)
num_classes = [1000, 100, 10]
model = video_classification_model.MultiHeadVideoClassificationModel(
backbone=backbone,
num_classes=num_classes,
input_specs={'image': input_specs},
dropout_rate=0.2,
aggregate_endpoints=aggregate_endpoints,
)
inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3)
logits = model(inputs)
for i in range(3):
self.assertAllEqual([2, num_classes[i]], logits[i].numpy().shape)
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for video_classification."""
import tensorflow as tf
# pylint: disable=unused-import
from official.modeling import optimization
from official.projects.videoglue.configs import video_classification as exp_cfg
from official.projects.videoglue.modeling import video_classification_model
from official.projects.videoglue.modeling.backbones import vit_3d
from official.projects.videoglue.tasks import multihead_video_classification
# pylint: enable=unused-import
class MultiheadVideoClassificationTest(tf.test.TestCase):
def test_one_head_video_classification(self):
config = exp_cfg.mh_video_classification()
config.task.train_data.global_batch_size = 2
config.task.train_data.num_classes = 400
config.task.validation_data.num_classes = 400
config.task.train_data.feature_shape = (16, 56, 56, 3)
config.task.validation_data.feature_shape = (16, 56, 56, 3)
task = multihead_video_classification.MultiHeadVideoClassificationTask(
config.task)
model = task.build_model()
metrics = task.build_metrics()
data_inputs = {
'image': tf.ones([2, 16, 56, 56, 3], tf.float32),
'label': tf.ones([2, 400], tf.float32),
}
opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
logs = task.train_step(data_inputs, model, optimizer, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('label/accuracy', logs)
self.assertIn('label/top_1_accuracy', logs)
self.assertIn('label/top_5_accuracy', logs)
logs = task.validation_step(data_inputs, model, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('label/accuracy', logs)
self.assertIn('label/top_1_accuracy', logs)
self.assertIn('label/top_5_accuracy', logs)
def test_one_head_video_classification_multilabel(self):
config = exp_cfg.mh_video_classification()
config.task.train_data.global_batch_size = 2
config.task.train_data.num_classes = 400
config.task.train_data.is_multilabel = True
config.task.validation_data.num_classes = 400
config.task.train_data.feature_shape = (16, 56, 56, 3)
config.task.validation_data.feature_shape = (16, 56, 56, 3)
config.task.validation_data.is_multilabel = True
task = multihead_video_classification.MultiHeadVideoClassificationTask(
config.task)
model = task.build_model()
metrics = task.build_metrics()
data_inputs = {
'image': tf.ones([2, 16, 56, 56, 3], tf.float32),
'label': tf.ones([2, 400], tf.float32),
}
opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
logs = task.train_step(data_inputs, model, optimizer, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('label/ROC-AUC', logs)
self.assertIn('label/PR-AUC', logs)
self.assertIn('label/RecallAtPrecision95', logs)
logs = task.validation_step(data_inputs, model, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('label/ROC-AUC', logs)
self.assertIn('label/PR-AUC', logs)
self.assertIn('label/RecallAtPrecision95', logs)
def test_multi_head_video_classification(self):
config = exp_cfg.mh_video_classification()
config.task.train_data.global_batch_size = 2
config.task.train_data.num_classes = [123, 456]
config.task.train_data.label_names = ['label_a', 'label_b']
config.task.validation_data.num_classes = [123, 456]
config.task.validation_data.label_names = ['label_a', 'label_b']
config.task.train_data.feature_shape = (16, 56, 56, 3)
config.task.validation_data.feature_shape = (16, 56, 56, 3)
task = multihead_video_classification.MultiHeadVideoClassificationTask(
config.task)
model = task.build_model()
metrics = task.build_metrics()
data_inputs = {
'image': tf.ones([2, 16, 56, 56, 3], tf.float32),
'label_a': tf.ones([2, 123], tf.float32),
'label_b': tf.ones([2, 456], tf.float32),
}
opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
logs = task.train_step(data_inputs, model, optimizer, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('label_a/accuracy', logs)
self.assertIn('label_a/top_1_accuracy', logs)
self.assertIn('label_a/top_5_accuracy', logs)
self.assertIn('label_b/accuracy', logs)
self.assertIn('label_b/top_1_accuracy', logs)
self.assertIn('label_b/top_5_accuracy', logs)
self.assertIn('label_joint/accuracy', logs)
logs = task.validation_step(data_inputs, model, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('label_a/accuracy', logs)
self.assertIn('label_a/top_1_accuracy', logs)
self.assertIn('label_a/top_5_accuracy', logs)
self.assertIn('label_b/accuracy', logs)
self.assertIn('label_b/top_1_accuracy', logs)
self.assertIn('label_b/top_5_accuracy', logs)
self.assertIn('label_joint/accuracy', logs)
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for spatiotemporal_action_localization."""
import tensorflow as tf
from official.core import exp_factory
from official.modeling import optimization
from official.projects.videoglue.modeling import video_action_transformer_model # pylint: disable=unused-import
from official.projects.videoglue.tasks import spatiotemporal_action_localization as stal_task
class SpatiotemporalActionLocalizationTest(tf.test.TestCase):
def test_spatiotemporal_action_localization(self):
config = exp_factory.get_exp_config('spatiotemporal_action_localization')
config.task.train_data.global_batch_size = 2
config.task.train_data.feature_shape = (32, 56, 56, 3)
config.task.validation_data.global_batch_size = 2
config.task.validation_data.feature_shape = (32, 56, 56, 3)
config.task.losses.l2_weight_decay = 1e-7
task = stal_task.SpatiotemporalActionLocalizationTask(config.task)
model = task.build_model()
metrics = task.build_metrics()
data_inputs = {
'image': tf.ones([2, 32, 56, 56, 3], tf.float32),
'instances_position': tf.ones([2, 32, 4], tf.float32),
'instances_score': tf.ones([2, 32], tf.float32),
'instances_mask': tf.ones([2, 32], tf.float32),
'label': tf.ones([2, 32, 80], tf.float32),
'nonmerge_label': tf.ones([2, 32, 80], tf.float32),
'nonmerge_instances_position': tf.ones([2, 32, 4], tf.float32),
}
opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
logs = task.train_step(data_inputs, model, optimizer, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('model_loss', logs)
self.assertIn('regularization_loss', logs)
logs = task.validation_step(data_inputs, model, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('model_loss', logs)
self.assertIn('regularization_loss', logs)
self.assertIn('nonmerge_label', logs)
self.assertIn('nonmerge_instances_position', logs)
if __name__ == '__main__':
tf.test.main()