Commit 21158b40 authored by Hongkun Yu, committed by A. Unique TensorFlower

No public description

PiperOrigin-RevId: 558890434
Parent 8bbb4841
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for backbones_3d."""
import tensorflow as tf
from official.projects.videoglue.configs import backbones_3d
class Backbones3DTest(tf.test.TestCase):
def test_vit_3d(self):
config = backbones_3d.Backbone3D(
type='vit_3d',
vit_3d=backbones_3d.VisionTransformer3D())
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for dataset."""
import tensorflow as tf
from official.projects.videoglue.configs import dataset
from official.vision.configs import common as common_cfg
class DatasetTest(tf.test.TestCase):
def test_dataset_valid(self):
config = dataset.DataConfig(
name='dummy_set',
data_augmentation=dataset.DataAugmentation(
type='ava', ava=dataset.AVA(scale_min=0.1, scale_max=1.0)),
feature_shape=(1, 2, 3, 4),
autoaug=common_cfg.AutoAugment(),
randaug=common_cfg.RandAugment(),
mixup_cutmix=common_cfg.MixupAndCutmix())
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for head."""
import tensorflow as tf
from official.projects.videoglue.configs import head as head_cfg
class HeadTest(tf.test.TestCase):
def test_mlp_head_valid(self):
config = head_cfg.MLP(
num_hidden_channels=128,
num_hidden_layers=4,
num_output_channels=1280,
use_sync_bn=True,
norm_momentum=0.99,
norm_epsilon=1e-5,
activation='relu')
config.validate()
def test_action_transformer_head_valid(self):
config = head_cfg.ActionTransformer(
activation='relu',
tx_activation='relu')
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for spatiotemporal_action_localization."""
import tensorflow as tf
from official.projects.videoglue.configs import spatiotemporal_action_localization as stal
class SpatiotemporalActionLocalizationTest(tf.test.TestCase):
def test_spatiotemporal_action_localization_config(self):
config = (
stal.spatiotemporal_action_localization())
self.assertIsInstance(
config.task,
stal.SpatiotemporalActionLocalizationTask)
self.assertIsInstance(
config.task.model,
stal.VideoActionTransformerModel)
def test_spatiotemporal_action_localization_vit12_config(self):
config = (
stal.spatiotemporal_action_localization_vit12())
self.assertIsInstance(
config.task,
stal.SpatiotemporalActionLocalizationTask)
self.assertEqual(
config.trainer.optimizer_config.optimizer.type, 'vit_adamw')
def test_spatiotemporal_action_localization_vit16_config(self):
config = (
stal.spatiotemporal_action_localization_vit16())
self.assertIsInstance(
config.task,
stal.SpatiotemporalActionLocalizationTask)
self.assertEqual(
config.trainer.optimizer_config.optimizer.type, 'vit_adamw')
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for video_classification."""
import tensorflow as tf
from official.projects.videoglue.configs import video_classification as cfg
class VideoCoarseClassificationTest(tf.test.TestCase):
def test_video_classification_config(self):
config = cfg.mh_video_classification()
self.assertIsInstance(config.task, cfg.MultiHeadVideoClassificationTask)
self.assertIsInstance(config.task.model,
cfg.MultiHeadVideoClassificationModel)
config.task.train_data.is_training = None
with self.assertRaises(KeyError):
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for action_localization data loader."""
import io
from absl import logging
import numpy as np
from PIL import Image
import tensorflow as tf
from official.projects.videoglue.datasets import action_localization
IMAGE_KEY = 'image/encoded'
KEYFRAME_INDEX = 'clip/key_frame/frame_index'
KEYFRAME_BOX_PREFIX = 'clip/key_frame/bbox'
DETECTED_BOX_PREFIX = 'centernet/bbox'
TFR_PATH = '/tmp/example.tfrecord'
def create_fake_tfse_sstable():
"""Creates fake data."""
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
with io.BytesIO() as buffer:
random_image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
num_frames = 4
tfse = tf.train.SequenceExample()
# keyframe index
tfse.context.feature.get_or_create(KEYFRAME_INDEX).int64_list.value[:] = [2]
# keyframe boxes
tfse.context.feature.get_or_create(
f'{KEYFRAME_BOX_PREFIX}/ymin').float_list.value[:] = [0.0, 0.1, 0.2, 0.2]
tfse.context.feature.get_or_create(
f'{KEYFRAME_BOX_PREFIX}/xmin').float_list.value[:] = [0.0, 0.1, 0.2, 0.2]
tfse.context.feature.get_or_create(
f'{KEYFRAME_BOX_PREFIX}/ymax').float_list.value[:] = [0.5, 0.6, 0.7, 0.7]
tfse.context.feature.get_or_create(
f'{KEYFRAME_BOX_PREFIX}/xmax').float_list.value[:] = [0.5, 0.6, 0.7, 0.7]
tfse.context.feature.get_or_create(
f'{KEYFRAME_BOX_PREFIX}/score').float_list.value[:] = [1.0, 1.0, 1.0, 1.0]
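# Note: the third and fourth keyframe boxes share identical coordinates on
# purpose, so that `merge_multi_labels` has duplicates to merge downstream.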
# boxes labels
tfse.context.feature.get_or_create(
f'{KEYFRAME_BOX_PREFIX}/label/index').int64_list.value[:] = [
0, 1, 2, 3
]
tfse.context.feature.get_or_create(
f'{KEYFRAME_BOX_PREFIX}/label/string').bytes_list.value[:] = [
b'0', b'1', b'2', b'3',
]
for i in range(num_frames):
# image
tfse.feature_lists.feature_list.get_or_create(
IMAGE_KEY).feature.add().bytes_list.value[:] = [raw_image_bytes]
# detected boxes.
tfse.feature_lists.feature_list.get_or_create(
f'{DETECTED_BOX_PREFIX}/ymin').feature.add().float_list.value[:] = [
0.0, 0.1, 0.2
]
tfse.feature_lists.feature_list.get_or_create(
f'{DETECTED_BOX_PREFIX}/xmin').feature.add().float_list.value[:] = [
0.0, 0.1, 0.2
]
tfse.feature_lists.feature_list.get_or_create(
f'{DETECTED_BOX_PREFIX}/ymax').feature.add().float_list.value[:] = [
0.5, 0.6, 0.7
]
tfse.feature_lists.feature_list.get_or_create(
f'{DETECTED_BOX_PREFIX}/xmax').feature.add().float_list.value[:] = [
0.5, 0.6, 0.7
]
tfse.feature_lists.feature_list.get_or_create(
f'{DETECTED_BOX_PREFIX}/score').feature.add().float_list.value[:] = [
0.91, 0.91, 0.1 * i
]
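# The third detected box's score varies with the frame index (0.1 * i), so
# any score-based filtering sees a different value per frame; the first two
# boxes keep a constant high score of 0.91.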
writer = tf.io.TFRecordWriter(TFR_PATH)
writer.write(tfse.SerializeToString())
logging.info('Writing tfrecord table: %s', TFR_PATH)
writer.close()
class ActionLocalizationTest(tf.test.TestCase):
def test_create_action_localization_dataset(self):
create_fake_tfse_sstable()
dataset_cls = action_localization.ActionLocalizationBaseFactory(
subset='train')
dataset_cls._NUM_CLASSES = 5
dataset_cls._ZERO_BASED_INDEX = True
configs = {
'is_training': False,
'num_frames': 4,
'temporal_stride': 1,
'num_instance_per_frame': 5,
'one_hot_label': True,
'merge_multi_labels': True,
'import_detected_bboxes': True,
'augmentation_type': 'ava',
'augmentation_params': {'scale_min': 0.0, 'scale_max': 0.0}
}
dataset_cls.configure(**configs)
ds = dataset_cls.make_dataset(shuffle=False, batch_size=1)
ds_iter = iter(ds)
data = next(ds_iter)
expected_subset = [
'image',
'keyframe_index',
'label',
'instances_position',
'instances_mask',
'instances_score',
'nonmerge_label',
'nonmerge_instances_position',
'detected_instances_position',
'detected_instances_mask',
'detected_instances_score',
]
self.assertSameElements(expected_subset, data.keys())
self.assertAllEqual(data['keyframe_index'], [[2]])
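# Keyframe boxes 2 and 3 share coordinates and are merged into a single
# multi-label instance (classes 2 and 3 both set), leaving three valid
# instances padded to num_instance_per_frame=5; rows 3-4 are all-zero padding.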
expected_label = tf.constant(
[[1., 0., 0., 0., 0.],
[0., 1., 0., 0., 0.],
[0., 0., 1., 1., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]])
expected_label = expected_label[None, ...]
self.assertAllEqual(data['label'], expected_label)
expected_instances_mask = tf.constant([True, True, True, False, False])
expected_instances_mask = expected_instances_mask[None, :]
self.assertAllEqual(data['instances_mask'], expected_instances_mask)
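# The non-merged view keeps all four original labels, padded with -1.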
expected_nonmerge_label = tf.constant([0, 1, 2, 3, -1])
expected_nonmerge_label = expected_nonmerge_label[None, :]
self.assertAllEqual(data['nonmerge_label'], expected_nonmerge_label)
self.assertAllEqual(data['detected_instances_position'].shape, [1, 5, 4])
self.assertAllEqual(data['detected_instances_mask'].shape, [1, 5])
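# At keyframe index 2 the third detected box has score 0.1 * 2 = 0.2, which
# presumably falls below the loader's score threshold, so only the two
# 0.91-score boxes are marked valid.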
expected_detected_instances_mask = tf.constant(
[[True, True, False, False, False]])
self.assertAllEqual(data['detected_instances_mask'],
expected_detected_instances_mask)
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for video_classification."""
import io
from absl import logging
import numpy as np
from PIL import Image
import tensorflow as tf
from official.projects.videoglue.datasets import video_classification
from official.vision.configs import common as common_cfg
IMAGE_KEY = 'image/encoded'
LABEL_KEY = 'clip/label/index'
TFR_PATH = '/tmp/sample.tfrecord'
def create_fake_tfse_sstable():
"""Creates fake data."""
num_frames = 25
tfse = tf.train.SequenceExample()
tfse.context.feature.get_or_create(LABEL_KEY).int64_list.value[:] = [0]
for frame_id in range(num_frames):
image = np.ones((263, 320, 3), dtype=np.uint8) * frame_id
image = Image.fromarray(image)
with io.BytesIO() as buffer:
image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
tfse.feature_lists.feature_list.get_or_create(
IMAGE_KEY).feature.add().bytes_list.value[:] = [raw_image_bytes]
writer = tf.io.TFRecordWriter(TFR_PATH)
writer.write(tfse.SerializeToString())
logging.info('Writing tfrecord table: %s', TFR_PATH)
writer.close()
class VideoClassificationTest(tf.test.TestCase):
def test_create_video_classification_data(self):
create_fake_tfse_sstable()
dataset_cls = video_classification.VideoClassificationBaseFactory(
subset='train')
configs = {
'is_training': True,
'num_frames': 4,
'one_hot_label': True,
}
dataset_cls.configure(**configs)
ds = dataset_cls.make_dataset(shuffle=False, batch_size=2)
ds_iter = iter(ds)
data = next(ds_iter)
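# The one-hot label has 400 classes, presumably the factory default
# (Kinetics-400 style), since `configs` does not override num_classes.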
self.assertAllEqual(data['label'].shape, [2, 400])
self.assertAllEqual(data['image'].shape, [2, 4, 224, 224, 3])
def test_video_classification_randaug(self):
create_fake_tfse_sstable()
dataset_cls = video_classification.VideoClassificationBaseFactory(
subset='train')
configs = {
'is_training': True,
'num_frames': 4,
'one_hot_label': True,
'randaug_params': common_cfg.RandAugment().as_dict(),
}
dataset_cls.configure(**configs)
ds = dataset_cls.make_dataset(shuffle=False, batch_size=2)
ds_iter = iter(ds)
data = next(ds_iter)
self.assertAllEqual(data['label'].shape, [2, 400])
self.assertAllEqual(data['image'].shape, [2, 4, 224, 224, 3])
def test_video_classification_autoaug(self):
create_fake_tfse_sstable()
dataset_cls = video_classification.VideoClassificationBaseFactory(
subset='train')
configs = {
'is_training': True,
'num_frames': 4,
'one_hot_label': True,
'autoaug_params': common_cfg.AutoAugment().as_dict(),
}
dataset_cls.configure(**configs)
ds = dataset_cls.make_dataset(shuffle=False, batch_size=2)
ds_iter = iter(ds)
data = next(ds_iter)
self.assertAllEqual(data['label'].shape, [2, 400])
self.assertAllEqual(data['image'].shape, [2, 4, 224, 224, 3])
def test_video_classification_mixup_cutmix(self):
create_fake_tfse_sstable()
dataset_cls = video_classification.VideoClassificationBaseFactory(
subset='train')
configs = {
'is_training': True,
'num_frames': 4,
'one_hot_label': True,
'mixup_cutmix_params': common_cfg.MixupAndCutmix().as_dict(),
}
dataset_cls.configure(**configs)
ds = dataset_cls.make_dataset(shuffle=False, batch_size=2)
ds_iter = iter(ds)
data = next(ds_iter)
self.assertAllEqual(data['label'].shape, [2, 400])
self.assertAllEqual(data['image'].shape, [2, 4, 224, 224, 3])
def test_video_classification_sample_segments(self):
create_fake_tfse_sstable()
dataset_cls = video_classification.VideoClassificationBaseFactory(
subset='train')
configs = {
'is_training': False,
'num_frames': 5,
'temporal_stride': 1,
'sample_from_segments': True,
'one_hot_label': True,
'mixup_cutmix_params': common_cfg.MixupAndCutmix().as_dict(),
}
dataset_cls.configure(**configs)
ds = dataset_cls.make_dataset(shuffle=False, batch_size=1)
ds_iter = iter(ds)
data = next(ds_iter)
self.assertAllEqual(data['label'].shape, [1, 400])
self.assertAllEqual(data['image'].shape, [1, 5, 224, 224, 3])
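# Each fake frame is a constant image equal to its frame index, and JPEG
# round-trips a constant image almost exactly. Sampling 5 frames from 25 by
# segments picks roughly frame indices [2, 7, 12, 16, 21]; images are scaled
# to [0, 1], so multiplying by 255 recovers those indices.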
average_image = tf.reduce_mean(data['image'] * 255., axis=[2, 3, 4])
self.assertAllEqual(average_image[0].numpy(), [2.0, 7.0, 12.0, 16.0, 21.0])
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for spatiotemporal_action_localization_evaluator."""
import tensorflow as tf
from official.projects.videoglue.evaluation import spatiotemporal_action_localization_evaluator as eval_util
class SpatiotemporalActionLocalizationEvaluatorTest(tf.test.TestCase):
def _create_test_data_simple(self):
boxes = tf.convert_to_tensor(
[[[0.1, 0.15, 0.2, 0.25], [0.35, 0.18, 0.43, 0.4],
[0.2, 0.1, 0.3, 0.2], [0.65, 0.55, 0.75, 0.85]],
[[0.2, 0.5, 0.5, 0.8], [0.7, 0.1, 0.9, 0.9],
[0.1, 0.4, 0.5, 0.7], [0.04, 0.05, 0.88, 0.77]]], dtype=tf.float32)
nonmerge_boxes = boxes
classes = tf.convert_to_tensor([[0, 2, 3, 4], [11, 12, 13, 14]],
dtype=tf.int32)
predictions = tf.one_hot(classes, depth=80)
data = {
'instances_position': boxes,
'nonmerge_instances_position': nonmerge_boxes,
'predictions': predictions,
'nonmerge_label': classes,
}
return data
def _create_test_data_complex(self):
nonmerge_boxes = tf.convert_to_tensor(
[[[0.1, 0.15, 0.2, 0.25], [0.1, 0.15, 0.2, 0.25],
[0.2, 0.1, 0.3, 0.2], [0.65, 0.55, 0.75, 0.85]],
[[0.2, 0.5, 0.5, 0.8], [0.7, 0.1, 0.9, 0.9],
[0.2, 0.5, 0.5, 0.8], [0.7, 0.1, 0.9, 0.9]]], dtype=tf.float32)
boxes = tf.convert_to_tensor(
[[[0.1, 0.15, 0.2, 0.25], [0.2, 0.1, 0.3, 0.2],
[0.65, 0.55, 0.75, 0.85], [-1, -1, -1, -1]],
[[0.2, 0.5, 0.5, 0.8], [0.7, 0.1, 0.9, 0.9],
[-1, -1, -1, -1], [-1, -1, -1, -1]]], dtype=tf.float32)
classes = tf.convert_to_tensor([[0, 2, 3, 4], [11, 12, 13, 14]],
dtype=tf.int32)
predictions = tf.one_hot(classes, depth=80)
data = {
'instances_position': boxes,
'nonmerge_instances_position': nonmerge_boxes,
'predictions': predictions,
'nonmerge_label': classes,
}
return data
def test_action_localization_eval_simple(self):
data = self._create_test_data_simple()
evaluator = eval_util.SpatiotemporalActionLocalizationEvaluator()
evaluator.reset_states()
evaluator.update_state(data)
metrics = evaluator.result()
self.assertAlmostEqual(metrics['mAP@.5IOU'], 1.0)
def test_action_localization_eval_complex(self):
data = self._create_test_data_complex()
evaluator = eval_util.SpatiotemporalActionLocalizationEvaluator()
evaluator.reset_states()
evaluator.update_state(data)
metrics = evaluator.result()
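# With duplicate boxes merged away and padded (-1) rows present, predictions
# no longer match the non-merged groundtruth one-to-one, so the mAP drops
# below 1.0; 0.64375 is the regression value for this fixture.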
self.assertAlmostEqual(metrics['mAP@.5IOU'], 0.64375)
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for vit_3d."""
from absl.testing import parameterized
import tensorflow as tf
from official.projects.videoglue.modeling.backbones import vit_3d
class Vit3DTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(8, 224, 87718656),
(16, 256, 88204032),
)
def test_network_creation(self, num_frames, input_size, params_count):
"""Test creation of VisionTransformer family models."""
tf.keras.backend.set_image_data_format('channels_last')
input_specs = tf.keras.layers.InputSpec(
shape=[2, num_frames, input_size, input_size, 3])
network = vit_3d.VisionTransformer3D(input_specs=input_specs)
inputs = tf.keras.Input(
shape=(num_frames, input_size, input_size, 3), batch_size=1)
_ = network(inputs)
self.assertEqual(network.count_params(), params_count)
def test_network_none_pooler(self):
"""Tests creation of VisionTransformer family models."""
tf.keras.backend.set_image_data_format('channels_last')
num_frames = 8
input_size = 224
input_specs = tf.keras.layers.InputSpec(
shape=[2, num_frames, input_size, input_size, 3])
network = vit_3d.VisionTransformer3D(
input_specs=input_specs,
pooler='none',
representation_size=128)
inputs = tf.keras.Input(
shape=(num_frames, input_size, input_size, 3), batch_size=1)
endpoints = network(inputs)
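# The 'none' pooler returns the token grid. Assuming the default 16x16
# spatial patches and a temporal patch size of 4, 224 / 16 = 14 and
# 8 / 4 = 2, with tokens projected to representation_size=128.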
self.assertEqual(endpoints['encoded_tokens'].shape, [1, 2, 14, 14, 128])
@parameterized.parameters('native', 'mae')
def test_network_convention(self, variant):
"""Tests creation of VisionTransformer family models."""
tf.keras.backend.set_image_data_format('channels_last')
num_frames = 8
input_size = 224
input_specs = tf.keras.layers.InputSpec(
shape=[2, num_frames, input_size, input_size, 3])
network = vit_3d.VisionTransformer3D(
variant=variant,
input_specs=input_specs,
pooler='none',
representation_size=128)
inputs = tf.keras.Input(
shape=(num_frames, input_size, input_size, 3), batch_size=1)
endpoints = network(inputs)
self.assertEqual(endpoints['encoded_tokens'].shape, [1, 2, 14, 14, 128])
def test_network_pos_embed_interpolation_mae(self):
"""Tests creation of VisionTransformer family models."""
tf.keras.backend.set_image_data_format('channels_last')
variant = 'mae'
pos_embed_shape = (8, 14, 14)
num_frames = 8
input_size = 256
input_specs = tf.keras.layers.InputSpec(
shape=[2, num_frames, input_size, input_size, 3])
network = vit_3d.VisionTransformer3D(
variant=variant,
input_specs=input_specs,
pooler='none',
representation_size=128,
pos_embed_shape=pos_embed_shape)
inputs = tf.keras.Input(
shape=(num_frames, input_size, input_size, 3), batch_size=1)
endpoints = network(inputs)
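# The 256px input yields a 16x16 patch grid (256 / 16), so the stored 14x14
# positional-embedding grid must be interpolated up to 16x16.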
self.assertEqual(endpoints['encoded_tokens'].shape, [1, 2, 16, 16, 128])
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for action_transformer."""
import tensorflow as tf
from official.projects.videoglue.modeling.heads import action_transformer
class ActionTransformerTest(tf.test.TestCase):
def test_action_transformer_head_construction(self):
head = action_transformer.ActionTransformerHead(
num_hidden_layers=1,
num_hidden_channels=1024,
use_sync_bn=False,
num_classes=80,
# parameters for TxDecoder
num_tx_channels=128,
num_tx_layers=3,
num_tx_heads=3,
use_positional_embedding=True)
inputs = {
'features': tf.ones([2, 4, 16, 16, 128]),
'instances_position': tf.random.uniform([2, 6, 4]),
}
outputs = head(inputs, training=False)
self.assertAllEqual(outputs.shape, [2, 6, 80])
def test_action_transformer_linear_head_construction(self):
head = action_transformer.ActionTransformerHead(
num_hidden_layers=0,
num_hidden_channels=1024,
use_sync_bn=False,
num_classes=80,
dropout_rate=0.5,
# parameters for TxDecoder
num_tx_channels=128,
num_tx_layers=0,
num_tx_heads=3,
attention_dropout_rate=0.2,
use_positional_embedding=False)
inputs = {
'features': tf.ones([2, 4, 16, 16, 128]),
'instances_position': tf.random.uniform([2, 6, 4]),
}
outputs = head(inputs, training=False)
self.assertAllEqual(outputs.shape, [2, 6, 80])
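# With num_hidden_layers=0 and num_tx_layers=0 the head reduces to a single
# dense projection, so only one kernel/bias pair is expected below.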
trainable_weight_names = [w.name for w in head.weights]
expected_weight_names = [
'action_transformer_head/mlp/dense/kernel:0',
'action_transformer_head/mlp/dense/bias:0'
]
self.assertCountEqual(trainable_weight_names, expected_weight_names)
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for simple."""
import numpy as np
import tensorflow as tf
from official.projects.videoglue.modeling.heads import simple
class SimpleTest(tf.test.TestCase):
def test_mlp_construction(self):
mlp_head = simple.MLP(
num_hidden_layers=3,
num_hidden_channels=128,
num_output_channels=56,
use_sync_bn=False,
activation='relu')
inputs = tf.zeros([2, 512])
outputs = mlp_head(inputs, training=False)
num_params = np.sum(
[np.prod(v.get_shape()) for v in mlp_head.trainable_weights])
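# The count is consistent with bias-free hidden dense layers plus batch norm
# and a final biased projection (an assumption about the MLP internals):
#   512*128 + 2*(128*128) = 98304  dense kernels
#   3 * (2*128)           = 768    batch-norm gamma/beta
#   128*56 + 56           = 7224   output layer
#   total                 = 106296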
self.assertEqual(num_params, 106296)
self.assertAllEqual(outputs.shape, [2, 56])
def test_att_pooler_classifier_construction(self):
pooler_head = simple.AttentionPoolerClassificationHead(
num_heads=6,
hidden_size=768,
num_classes=3)
inputs = tf.zeros([2, 16, 768])
inputs = tf.reshape(inputs, (2, 4, 4, 768))
outputs = pooler_head(inputs, training=False)
self.assertAllEqual(outputs.shape, [2, 3])
def test_att_pooler_classifier_construction_with_posembed(self):
pooler_head = simple.AttentionPoolerClassificationHead(
num_heads=6,
hidden_size=768,
num_classes=3,
add_temporal_pos_embed=True)
inputs = tf.zeros([2, 16, 768])
inputs = tf.reshape(inputs, (2, 4, 4, 768))
outputs = pooler_head(inputs, training=False)
self.assertAllEqual(outputs.shape, [2, 3])
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for TransformerDecoder."""
import tensorflow as tf
from official.projects.videoglue.modeling.heads import transformer_decoder
class TransformerTest(tf.test.TestCase):
def test_decoder_unit_return_shape(self):
decoder_unit = transformer_decoder.DecoderUnit(
num_channels=128,
use_bias=True,
dropout_rate=0.5,
activation='relu',
layer_norm_epsilon=1e-7)
batch_size = 16
num_inputs = 128
num_channels = 256
input_tensor = tf.zeros([batch_size, num_inputs, num_channels])
memory_tensor = tf.ones([batch_size, num_inputs * 4, num_channels])
outputs = decoder_unit(input_tensor, memory_tensor, training=False)
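# num_channels == num_inputs == 128 in this test, so the last dimension of
# `hidden_states` (the projection width) coincides with num_inputs.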
self.assertAllEqual(outputs['hidden_states'].shape,
[batch_size, num_inputs, num_inputs])
self.assertAllEqual(outputs['attention_weights'].shape,
[batch_size, num_inputs, 4 * num_inputs])
def test_decoder_layer_return_shape(self):
decoder_layer = transformer_decoder.TransformerDecoderLayer(
num_channels=128,
num_heads=3,
use_bias=True,
dropout_rate=0.5,
activation='relu',
layer_norm_epsilon=1e-7)
batch_size = 16
num_inputs = 128
num_channels = 256
input_tensor = tf.zeros([batch_size, num_inputs, num_channels])
memory_tensor = tf.ones([batch_size, num_inputs * 4, num_channels])
outputs = decoder_layer(input_tensor, memory_tensor, training=False)
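# hidden_states width is num_inputs * 3 here only because num_channels
# equals num_inputs (128); the layer appears to concatenate its 3 heads'
# 128-channel outputs.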
self.assertAllEqual(outputs['hidden_states'].shape,
[batch_size, num_inputs, num_inputs * 3])
self.assertAllEqual(outputs['attention_weights'][-1].shape,
[batch_size, num_inputs, 4 * num_inputs])
def test_decoder_return_shape(self):
decoder = transformer_decoder.TransformerDecoder(
num_channels=128,
num_layers=5,
num_heads=3,
use_bias=True,
dropout_rate=0.5,
activation='relu',
layer_norm_epsilon=1e-7)
batch_size = 16
num_inputs = 128
num_channels = 256
input_tensor = tf.zeros([batch_size, num_inputs, num_channels])
memory_tensor = tf.ones([batch_size, num_inputs * 4, num_channels])
outputs = decoder(input_tensor, memory_tensor, training=False)
self.assertLen(outputs['attention_weights'], 5)
self.assertAllEqual(outputs['hidden_states'][-1].shape,
[batch_size, num_inputs, num_inputs * 3])
self.assertAllEqual(outputs['attention_weights'][-1][-1].shape,
[batch_size, num_inputs, 4 * num_inputs])
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for video_action_transformer_model."""
import tensorflow as tf
from official.projects.videoglue.configs import spatiotemporal_action_localization as cfg
from official.projects.videoglue.modeling import video_action_transformer_model
class VideoActionTransformerModelTest(tf.test.TestCase):
def test_video_action_transformer_model_construction(self):
model_config = cfg.VideoActionTransformerModel()
input_specs = {
'image': tf.keras.layers.InputSpec(shape=[None, 4, 20, 20, 3]),
'instances_position': tf.keras.layers.InputSpec(shape=[None, 8, 4])
}
model = video_action_transformer_model.build_video_action_transformer_model(
input_specs_dict=input_specs,
model_config=model_config,
num_classes=80)
self.assertIsInstance(
model, video_action_transformer_model.VideoActionTransformerModel)
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for video classification network."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.projects.videoglue.modeling import video_classification_model
from official.vision.modeling import backbones
class MultiHeadsVideoClassificationNetworkTest(
parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(50, 8, 112, 'relu', False),
(50, 8, 112, 'swish', True),
)
def test_resnet3d_network_creation(self, model_id, temporal_size,
spatial_size, activation,
aggregate_endpoints):
"""Test for creation of a ResNet3D-50 classifier."""
input_specs = tf.keras.layers.InputSpec(
shape=[None, temporal_size, spatial_size, spatial_size, 3])
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
tf.keras.backend.set_image_data_format('channels_last')
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
input_specs=input_specs,
activation=activation)
num_classes = 1000
model = video_classification_model.MultiHeadVideoClassificationModel(
backbone=backbone,
num_classes=num_classes,
input_specs={'image': input_specs},
dropout_rate=0.2,
aggregate_endpoints=aggregate_endpoints,
)
inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3)
logits = model(inputs)
self.assertAllEqual([2, num_classes], logits.numpy().shape)
@parameterized.parameters(
(50, 8, 112, 'relu', False),
(50, 8, 112, 'swish', False),
)
def test_resnet3d_network_pooler_head_creation(
self, model_id, temporal_size, spatial_size, activation,
aggregate_endpoints):
"""Test for creation of a ResNet3D-50 classifier."""
input_specs = tf.keras.layers.InputSpec(
shape=[None, temporal_size, spatial_size, spatial_size, 3])
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
tf.keras.backend.set_image_data_format('channels_last')
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
input_specs=input_specs,
activation=activation)
num_classes = 1000
model = video_classification_model.MultiHeadVideoClassificationModel(
backbone=backbone,
num_classes=num_classes,
input_specs={'image': input_specs},
dropout_rate=0.2,
aggregate_endpoints=aggregate_endpoints,
classifier_type='pooler')
inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3)
logits = model(inputs)
self.assertAllEqual([2, num_classes], logits.numpy().shape)
@parameterized.parameters(
(50, 8, 112, 'relu', False),
(50, 8, 112, 'swish', True),
)
def test_resnet3d_network_multiheads_creation(
self, model_id, temporal_size, spatial_size, activation,
aggregate_endpoints):
"""Test for creation of a ResNet3D-50 multiheads classifier."""
input_specs = tf.keras.layers.InputSpec(
shape=[None, temporal_size, spatial_size, spatial_size, 3])
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
tf.keras.backend.set_image_data_format('channels_last')
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
input_specs=input_specs,
activation=activation)
num_classes = [1000, 100, 10]
model = video_classification_model.MultiHeadVideoClassificationModel(
backbone=backbone,
num_classes=num_classes,
input_specs={'image': input_specs},
dropout_rate=0.2,
aggregate_endpoints=aggregate_endpoints,
)
inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3)
logits = model(inputs)
for i in range(3):
self.assertAllEqual([2, num_classes[i]], logits[i].numpy().shape)
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for video_classification."""
import tensorflow as tf
# pylint: disable=unused-import
from official.modeling import optimization
from official.projects.videoglue.configs import video_classification as exp_cfg
from official.projects.videoglue.modeling import video_classification_model
from official.projects.videoglue.modeling.backbones import vit_3d
from official.projects.videoglue.tasks import multihead_video_classification
# pylint: enable=unused-import
class MultiheadVideoClassificationTest(tf.test.TestCase):
def test_one_head_video_classification(self):
config = exp_cfg.mh_video_classification()
config.task.train_data.global_batch_size = 2
config.task.train_data.num_classes = 400
config.task.validation_data.num_classes = 400
config.task.train_data.feature_shape = (16, 56, 56, 3)
config.task.validation_data.feature_shape = (16, 56, 56, 3)
task = multihead_video_classification.MultiHeadVideoClassificationTask(
config.task)
model = task.build_model()
metrics = task.build_metrics()
data_inputs = {
'image': tf.ones([2, 16, 56, 56, 3], tf.float32),
'label': tf.ones([2, 400], tf.float32),
}
opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
logs = task.train_step(data_inputs, model, optimizer, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('label/accuracy', logs)
self.assertIn('label/top_1_accuracy', logs)
self.assertIn('label/top_5_accuracy', logs)
logs = task.validation_step(data_inputs, model, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('label/accuracy', logs)
self.assertIn('label/top_1_accuracy', logs)
self.assertIn('label/top_5_accuracy', logs)
def test_one_head_video_classification_multilabel(self):
config = exp_cfg.mh_video_classification()
config.task.train_data.global_batch_size = 2
config.task.train_data.num_classes = 400
config.task.train_data.is_multilabel = True
config.task.validation_data.num_classes = 400
config.task.train_data.feature_shape = (16, 56, 56, 3)
config.task.validation_data.feature_shape = (16, 56, 56, 3)
config.task.validation_data.is_multilabel = True
task = multihead_video_classification.MultiHeadVideoClassificationTask(
config.task)
model = task.build_model()
metrics = task.build_metrics()
data_inputs = {
'image': tf.ones([2, 16, 56, 56, 3], tf.float32),
'label': tf.ones([2, 400], tf.float32),
}
opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
logs = task.train_step(data_inputs, model, optimizer, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('label/ROC-AUC', logs)
self.assertIn('label/PR-AUC', logs)
self.assertIn('label/RecallAtPrecision95', logs)
logs = task.validation_step(data_inputs, model, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('label/ROC-AUC', logs)
self.assertIn('label/PR-AUC', logs)
self.assertIn('label/RecallAtPrecision95', logs)
def test_multi_head_video_classification(self):
config = exp_cfg.mh_video_classification()
config.task.train_data.global_batch_size = 2
config.task.train_data.num_classes = [123, 456]
config.task.train_data.label_names = ['label_a', 'label_b']
config.task.validation_data.num_classes = [123, 456]
config.task.validation_data.label_names = ['label_a', 'label_b']
config.task.train_data.feature_shape = (16, 56, 56, 3)
config.task.validation_data.feature_shape = (16, 56, 56, 3)
task = multihead_video_classification.MultiHeadVideoClassificationTask(
config.task)
model = task.build_model()
metrics = task.build_metrics()
data_inputs = {
'image': tf.ones([2, 16, 56, 56, 3], tf.float32),
'label_a': tf.ones([2, 123], tf.float32),
'label_b': tf.ones([2, 456], tf.float32),
}
opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
logs = task.train_step(data_inputs, model, optimizer, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('label_a/accuracy', logs)
self.assertIn('label_a/top_1_accuracy', logs)
self.assertIn('label_a/top_5_accuracy', logs)
self.assertIn('label_b/accuracy', logs)
self.assertIn('label_b/top_1_accuracy', logs)
self.assertIn('label_b/top_5_accuracy', logs)
self.assertIn('label_joint/accuracy', logs)
logs = task.validation_step(data_inputs, model, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('label_a/accuracy', logs)
self.assertIn('label_a/top_1_accuracy', logs)
self.assertIn('label_a/top_5_accuracy', logs)
self.assertIn('label_b/accuracy', logs)
self.assertIn('label_b/top_1_accuracy', logs)
self.assertIn('label_b/top_5_accuracy', logs)
self.assertIn('label_joint/accuracy', logs)
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for spatiotemporal_action_localization."""
import tensorflow as tf
from official.core import exp_factory
from official.modeling import optimization
from official.projects.videoglue.modeling import video_action_transformer_model # pylint: disable=unused-import
from official.projects.videoglue.tasks import spatiotemporal_action_localization as stal_task
class SpatiotemporalActionLocalizationTest(tf.test.TestCase):
def test_spatiotemporal_action_localization(self):
config = exp_factory.get_exp_config('spatiotemporal_action_localization')
config.task.train_data.global_batch_size = 2
config.task.train_data.feature_shape = (32, 56, 56, 3)
config.task.validation_data.global_batch_size = 2
config.task.validation_data.feature_shape = (32, 56, 56, 3)
config.task.losses.l2_weight_decay = 1e-7
task = stal_task.SpatiotemporalActionLocalizationTask(config.task)
model = task.build_model()
metrics = task.build_metrics()
data_inputs = {
'image': tf.ones([2, 32, 56, 56, 3], tf.float32),
'instances_position': tf.ones([2, 32, 4], tf.float32),
'instances_score': tf.ones([2, 32], tf.float32),
'instances_mask': tf.ones([2, 32], tf.float32),
'label': tf.ones([2, 32, 80], tf.float32),
'nonmerge_label': tf.ones([2, 32, 80], tf.float32),
'nonmerge_instances_position': tf.ones([2, 32, 4], tf.float32),
}
opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
logs = task.train_step(data_inputs, model, optimizer, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('model_loss', logs)
self.assertIn('regularization_loss', logs)
logs = task.validation_step(data_inputs, model, metrics=metrics)
self.assertIn('loss', logs)
self.assertIn('model_loss', logs)
self.assertIn('regularization_loss', logs)
self.assertIn('nonmerge_label', logs)
self.assertIn('nonmerge_instances_position', logs)
if __name__ == '__main__':
tf.test.main()