# model settings model = dict( type='Recognizer3D', backbone=dict( type='ResNet2Plus1d', depth=34, pretrained=None, pretrained2d=False, norm_eval=False, conv_cfg=dict(type='Conv2plus1d'), norm_cfg=dict(type='SyncBN', requires_grad=True, eps=1e-3), act_cfg=dict(type='ReLU'), conv1_kernel=(3, 7, 7), conv1_stride_t=1, pool1_stride_t=1, inflate=(1, 1, 1, 1), spatial_strides=(1, 2, 2, 2), temporal_strides=(1, 2, 2, 2), zero_init_residual=False), cls_head=dict( type='I3DHead', num_classes=400, in_channels=512, spatial_type='avg', dropout_ratio=0.5, init_std=0.01)) # model training and testing settings train_cfg = None test_cfg = dict(average_clips=None) # dataset settings dataset_type = 'VideoDataset' data_root = 'data/kinetics400/videos_train' data_root_val = 'data/kinetics400/videos_val' ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) train_pipeline = [ dict(type='DecordInit'), dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='RandomResizedCrop'), dict(type='Resize', scale=(224, 224), keep_ratio=False), dict(type='Flip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='FormatShape', input_format='NCTHW'), dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), dict(type='ToTensor', keys=['imgs', 'label']) ] val_pipeline = [ dict(type='DecordInit'), dict( type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='CenterCrop', crop_size=224), dict(type='Flip', flip_ratio=0), dict(type='Normalize', **img_norm_cfg), dict(type='FormatShape', input_format='NCTHW'), dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), dict(type='ToTensor', keys=['imgs']) ] test_pipeline = [ dict(type='DecordInit'), dict( type='SampleFrames', clip_len=8, frame_interval=8, num_clips=10, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='ThreeCrop', crop_size=256), dict(type='Flip', flip_ratio=0), dict(type='Normalize', **img_norm_cfg), dict(type='FormatShape', input_format='NCTHW'), dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), dict(type='ToTensor', keys=['imgs']) ] data = dict( videos_per_gpu=16, workers_per_gpu=4, train=dict( type=dataset_type, ann_file=ann_file_train, data_prefix=data_root, pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=ann_file_val, data_prefix=data_root_val, pipeline=val_pipeline, test_mode=True), test=dict( type=dataset_type, ann_file=ann_file_val, data_prefix=data_root_val, pipeline=test_pipeline, test_mode=True)) # optimizer optimizer = dict( type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0001) # this lr is used for 8 gpus optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) # learning policy lr_config = dict(policy='CosineAnnealing', min_lr=0) total_epochs = 180 checkpoint_config = dict(interval=5) evaluation = dict( interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5)) log_config = dict( interval=20, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook'), ]) # runtime settings dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/r2plus1d_r34_video_3d_8x8x1_180e_kinetics400_rgb/' load_from = None resume_from = None workflow = [('train', 1)] find_unused_parameters = False