From 0a54afe0fbdad609ae633dc21b58c32d2b752c0c Mon Sep 17 00:00:00 2001
From: Evan Shelhamer
Date: Thu, 19 May 2016 15:15:15 -0700
Subject: [PATCH] add FCN AlexNet for PASCAL VOC

---
 README.md                       |   7 +-
 voc-fcn-alexnet/caffemodel-url  |   1 +
 voc-fcn-alexnet/net.py          |  67 ++++++++
 voc-fcn-alexnet/solve.py        |  28 ++++
 voc-fcn-alexnet/solver.prototxt |  19 +++
 voc-fcn-alexnet/train.prototxt  | 273 ++++++++++++++++++++++++++++++++
 voc-fcn-alexnet/val.prototxt    | 273 ++++++++++++++++++++++++++++++++
 7 files changed, 665 insertions(+), 3 deletions(-)
 create mode 100644 voc-fcn-alexnet/caffemodel-url
 create mode 100644 voc-fcn-alexnet/net.py
 create mode 100644 voc-fcn-alexnet/solve.py
 create mode 100644 voc-fcn-alexnet/solver.prototxt
 create mode 100644 voc-fcn-alexnet/train.prototxt
 create mode 100644 voc-fcn-alexnet/val.prototxt

diff --git a/README.md b/README.md
index 3d95321..59c6d91 100644
--- a/README.md
+++ b/README.md
@@ -23,13 +23,14 @@ The "at-once" FCN-8s is fine-tuned from VGG-16 all-at-once by scaling the skip c
 * [FCN-8s PASCAL](tree/master/fcn8s): three stream, 8 pixel prediction stride version, scoring 65.5 mIU on seg11valid and 67.2 mIU on seg12test
 * [FCN-8s PASCAL at-once](tree/master/fcn8s): all-at-once edition of the three stream, 8 pixel prediction stride version, scoring 65.4 mIU on seg11valid
 
+[FCN-AlexNet PASCAL](tree/master/voc-fcn-alexnet): AlexNet (CaffeNet) architecture, single stream, 32 pixel prediction stride net, scoring 48.0 mIU on seg11valid.
+Unlike the FCN-32/16/8s models, this network is trained with gradient accumulation, normalized loss, and standard momentum.
+(Note: when both FCN-32s/FCN-VGG16 and FCN-AlexNet are trained in this same way FCN-VGG16 is far better; see Table 1 of the paper.)
+
 To reproduce the validation scores, use the [seg11valid](https://github.com/shelhamer/fcn.berkeleyvision.org/blob/master/data/pascal/seg11valid.txt) split defined by the paper in footnote 7. Since SBD train and PASCAL VOC 2011 segval intersect, we only evaluate on the non-intersecting set for validation purposes.
 
 **The following models have not yet been ported to master and trained with the latest settings. Check back soon.**
 
-PASCAL VOC:
-* [FCN-AlexNet PASCAL](https://gist.github.com/shelhamer/3f2c75f3c8c71357f24c#file-readme.md): AlexNet (CaffeNet) single stream, 32 pixel prediction stride version
-
 SIFT Flow model (also fine-tuned from VGG-16):
 
 * [FCN-16s SIFT Flow](https://gist.github.com/longjon/f35e3a101e1478f721f5#file-readme-md): two stream, 16 pixel prediction stride version
diff --git a/voc-fcn-alexnet/caffemodel-url b/voc-fcn-alexnet/caffemodel-url
new file mode 100644
index 0000000..d8a63a9
--- /dev/null
+++ b/voc-fcn-alexnet/caffemodel-url
@@ -0,0 +1 @@
+http://dl.caffe.berkeleyvision.org/fcn-alexnet-pascal.caffemodel
diff --git a/voc-fcn-alexnet/net.py b/voc-fcn-alexnet/net.py
new file mode 100644
index 0000000..91eae6c
--- /dev/null
+++ b/voc-fcn-alexnet/net.py
@@ -0,0 +1,67 @@
+import sys
+sys.path.append('../../python')
+
+import caffe
+from caffe import layers as L, params as P
+from caffe.coord_map import crop
+
+def conv_relu(bottom, ks, nout, stride=1, pad=0, group=1):
+    conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
+        num_output=nout, pad=pad, group=group)
+    return conv, L.ReLU(conv, in_place=True)
+
+def max_pool(bottom, ks, stride=1):
+    return L.Pooling(bottom, pool=P.Pooling.MAX, kernel_size=ks, stride=stride)
+
+def fcn(split):
+    n = caffe.NetSpec()
+    pydata_params = dict(split=split, mean=(104.00699, 116.66877, 122.67892),
+            seed=1337)
+    if split == 'train':
+        pydata_params['sbdd_dir'] = '../../data/sbdd/dataset'
+        pylayer = 'SBDDSegDataLayer'
+    else:
+        pydata_params['voc_dir'] = '../../data/pascal/VOC2011'
+        pylayer = 'VOCSegDataLayer'
+    n.data, n.label = L.Python(module='layers', layer=pylayer,
+            ntop=2, param_str=str(pydata_params))
+
+    # the base net
+    n.conv1, n.relu1 = conv_relu(n.data, 11, 96, stride=4, pad=100)
+    n.pool1 = max_pool(n.relu1, 3, stride=2)
+    n.norm1 = L.LRN(n.pool1, local_size=5, alpha=1e-4, beta=0.75)
+    n.conv2, n.relu2 = conv_relu(n.norm1, 5, 256, pad=2, group=2)
+    n.pool2 = max_pool(n.relu2, 3, stride=2)
+    n.norm2 = L.LRN(n.pool2, local_size=5, alpha=1e-4, beta=0.75)
+    n.conv3, n.relu3 = conv_relu(n.norm2, 3, 384, pad=1)
+    n.conv4, n.relu4 = conv_relu(n.relu3, 3, 384, pad=1, group=2)
+    n.conv5, n.relu5 = conv_relu(n.relu4, 3, 256, pad=1, group=2)
+    n.pool5 = max_pool(n.relu5, 3, stride=2)
+
+    # fully conv
+    n.fc6, n.relu6 = conv_relu(n.pool5, 6, 4096)
+    n.drop6 = L.Dropout(n.relu6, dropout_ratio=0.5, in_place=True)
+    n.fc7, n.relu7 = conv_relu(n.drop6, 1, 4096)
+    n.drop7 = L.Dropout(n.relu7, dropout_ratio=0.5, in_place=True)
+
+    n.score_fr = L.Convolution(n.drop7, num_output=21, kernel_size=1, pad=0,
+        param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)])
+    n.upscore = L.Deconvolution(n.score_fr,
+        convolution_param=dict(num_output=21, kernel_size=63, stride=32,
+            bias_term=False),
+        param=[dict(lr_mult=0)])
+    n.score = crop(n.upscore, n.data)
+    n.loss = L.SoftmaxWithLoss(n.score, n.label,
+            loss_param=dict(normalize=True, ignore_label=255))
+
+    return n.to_proto()
+
+def make_net():
+    with open('train.prototxt', 'w') as f:
+        f.write(str(fcn('train')))
+
+    with open('val.prototxt', 'w') as f:
+        f.write(str(fcn('seg11valid')))
+
+if __name__ == '__main__':
+    make_net()
diff --git a/voc-fcn-alexnet/solve.py b/voc-fcn-alexnet/solve.py
new file mode 100644
index 0000000..c9f8e71
--- /dev/null
+++ b/voc-fcn-alexnet/solve.py
@@ -0,0 +1,28 @@
+import caffe
+import surgery, score
+
+import numpy as np
+import os, sys
+
+import setproctitle
+setproctitle.setproctitle(os.path.basename(os.getcwd()))
+
+weights = '../alexnetfc.caffemodel'
+
+# init
+caffe.set_device(int(sys.argv[1]))
+caffe.set_mode_gpu()
+
+solver = caffe.SGDSolver('solver.prototxt')
+solver.net.copy_from(weights)
+
+# surgeries
+interp_layers = [k for k in solver.net.params.keys() if 'up' in k]
+surgery.interp(solver.net, interp_layers)
+
+# scoring
+val = np.loadtxt('../data/pascal/seg11valid.txt', dtype=str)
+
+for _ in range(25):
+    solver.step(4000)
+    score.seg_tests(solver, False, val, layer='score')
diff --git a/voc-fcn-alexnet/solver.prototxt b/voc-fcn-alexnet/solver.prototxt
new file mode 100644
index 0000000..b52b29d
--- /dev/null
+++ b/voc-fcn-alexnet/solver.prototxt
@@ -0,0 +1,19 @@
+train_net: "train.prototxt"
+test_net: "val.prototxt"
+test_iter: 736
+# make test net, but don't invoke it from the solver itself
+test_interval: 999999999
+display: 20
+average_loss: 20
+lr_policy: "fixed"
+# lr for normalized softmax
+base_lr: 1e-4
+# standard momentum
+momentum: 0.9
+# gradient accumulation
+iter_size: 20
+max_iter: 100000
+weight_decay: 0.0005
+snapshot: 4000
+snapshot_prefix: "snapshot/train"
+test_initialization: false
diff --git a/voc-fcn-alexnet/train.prototxt b/voc-fcn-alexnet/train.prototxt
new file mode 100644
index 0000000..0571cbf
--- /dev/null
+++ b/voc-fcn-alexnet/train.prototxt
@@ -0,0 +1,273 @@
+layer {
+  name: "data"
+  type: "Python"
+  top: "data"
+  top: "label"
+  python_param {
+    module: "layers"
+    layer: "SBDDSegDataLayer"
+    param_str: "{\'sbdd_dir\': \'../../data/sbdd/dataset\', \'seed\': 1337, \'split\': \'train\', \'mean\': (104.00699, 116.66877, 122.67892)}"
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  convolution_param {
+    num_output: 96
+    pad: 100
+    kernel_size: 11
+    group: 1
+    stride: 4
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "norm1"
+  type: "LRN"
+  bottom: "pool1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "norm1"
+  top: "conv2"
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+    stride: 1
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "conv2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "norm2"
+  type: "LRN"
+  bottom: "pool2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "norm2"
+  top: "conv3"
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 1
+    stride: 1
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+    stride: 1
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+    stride: 1
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
"fc6" + type: "Convolution" + bottom: "pool5" + top: "fc6" + convolution_param { + num_output: 4096 + pad: 0 + kernel_size: 6 + group: 1 + stride: 1 + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "drop6" + type: "Dropout" + bottom: "fc6" + top: "fc6" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc7" + type: "Convolution" + bottom: "fc6" + top: "fc7" + convolution_param { + num_output: 4096 + pad: 0 + kernel_size: 1 + group: 1 + stride: 1 + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "drop7" + type: "Dropout" + bottom: "fc7" + top: "fc7" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "score_fr" + type: "Convolution" + bottom: "fc7" + top: "score_fr" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 21 + pad: 0 + kernel_size: 1 + } +} +layer { + name: "upscore" + type: "Deconvolution" + bottom: "score_fr" + top: "upscore" + param { + lr_mult: 0 + } + convolution_param { + num_output: 21 + bias_term: false + kernel_size: 63 + stride: 32 + } +} +layer { + name: "score" + type: "Crop" + bottom: "upscore" + bottom: "data" + top: "score" + crop_param { + axis: 2 + offset: 18 + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "score" + bottom: "label" + top: "loss" + loss_param { + ignore_label: 255 + normalize: false + } +} diff --git a/voc-fcn-alexnet/val.prototxt b/voc-fcn-alexnet/val.prototxt new file mode 100644 index 0000000..8595497 --- /dev/null +++ b/voc-fcn-alexnet/val.prototxt @@ -0,0 +1,273 @@ +layer { + name: "data" + type: "Python" + top: "data" + top: "label" + python_param { + module: "layers" + layer: "VOCSegDataLayer" + param_str: "{\'voc_dir\': \'../../data/pascal/VOC2011\', \'seed\': 1337, \'split\': \'seg11valid\', \'mean\': (104.00699, 116.66877, 122.67892)}" + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + convolution_param { + num_output: 96 + pad: 100 + kernel_size: 11 + group: 1 + stride: 4 + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "norm1" + type: "LRN" + bottom: "pool1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "norm1" + top: "conv2" + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + stride: 1 + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "norm2" + type: "LRN" + bottom: "pool2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "norm2" + top: "conv3" + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 1 + stride: 1 + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + stride: 1 + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + 
bottom: "conv4" + top: "conv5" + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + stride: 1 + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "Convolution" + bottom: "pool5" + top: "fc6" + convolution_param { + num_output: 4096 + pad: 0 + kernel_size: 6 + group: 1 + stride: 1 + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "drop6" + type: "Dropout" + bottom: "fc6" + top: "fc6" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc7" + type: "Convolution" + bottom: "fc6" + top: "fc7" + convolution_param { + num_output: 4096 + pad: 0 + kernel_size: 1 + group: 1 + stride: 1 + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "drop7" + type: "Dropout" + bottom: "fc7" + top: "fc7" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "score_fr" + type: "Convolution" + bottom: "fc7" + top: "score_fr" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 21 + pad: 0 + kernel_size: 1 + } +} +layer { + name: "upscore" + type: "Deconvolution" + bottom: "score_fr" + top: "upscore" + param { + lr_mult: 0 + } + convolution_param { + num_output: 21 + bias_term: false + kernel_size: 63 + stride: 32 + } +} +layer { + name: "score" + type: "Crop" + bottom: "upscore" + bottom: "data" + top: "score" + crop_param { + axis: 2 + offset: 18 + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "score" + bottom: "label" + top: "loss" + loss_param { + ignore_label: 255 + normalize: false + } +} -- GitLab