# model settings model = dict( type='Recognizer2D', backbone=dict( type='ResNet', pretrained='torchvision://resnet50', depth=50, norm_eval=False), cls_head=dict( type='TSNHead', num_classes=400, in_channels=2048, spatial_type='avg', consensus=dict(type='AvgConsensus', dim=1), dropout_ratio=0.4, init_std=0.01)) # model training and testing settings train_cfg = None test_cfg = dict(average_clips=None) # dataset settings dataset_type = 'RawframeDataset' data_root = 'data/kinetics400/rawframes_train' data_root_val = 'data/kinetics400/rawframes_val' ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) train_pipeline = [ dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3), dict(type='RawFrameDecode'), dict(type='Resize', scale=(-1, 256)), dict( type='MultiScaleCrop', input_size=224, scales=(1, 0.875, 0.75, 0.66), random_crop=False, max_wh_scale_gap=1), dict(type='Resize', scale=(224, 224), keep_ratio=False), dict(type='Flip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='FormatShape', input_format='NCHW'), dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), dict(type='ToTensor', keys=['imgs', 'label']) ] val_pipeline = [ dict( type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3, test_mode=True), dict(type='RawFrameDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='CenterCrop', crop_size=224), dict(type='Flip', flip_ratio=0), dict(type='Normalize', **img_norm_cfg), dict(type='FormatShape', input_format='NCHW'), dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), dict(type='ToTensor', keys=['imgs']) ] test_pipeline = [ dict( type='SampleFrames', clip_len=1, frame_interval=1, num_clips=25, test_mode=True), dict(type='RawFrameDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='TenCrop', crop_size=224), dict(type='Flip', flip_ratio=0), dict(type='Normalize', **img_norm_cfg), dict(type='FormatShape', input_format='NCHW'), dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), dict(type='ToTensor', keys=['imgs']) ] data = dict( videos_per_gpu=32, workers_per_gpu=4, train=dict( type=dataset_type, ann_file=ann_file_train, data_prefix=data_root, pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=ann_file_val, data_prefix=data_root_val, pipeline=val_pipeline), test=dict( type=dataset_type, ann_file=ann_file_test, data_prefix=data_root_val, pipeline=test_pipeline)) # optimizer optimizer = dict( type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) # this lr is used for 8 gpus optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) # learning policy lr_config = dict(policy='step', step=[40, 80]) total_epochs = 100 checkpoint_config = dict(interval=5) evaluation = dict( interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5)) log_config = dict( interval=20, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook'), ]) # runtime settings dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/tsn_r50_1x1x3_100e_kinetics400_rgb/' load_from = None resume_from = None workflow = [('train', 1)]