未验证 提交 1be468bf 编写于 作者: G Guoxia Wang 提交者: GitHub

add a model PLSC-ViT (#5697) (#5706)

* add a model and app for VisionTransformer
Co-authored-by: NliuTINA0907 <65896652+liuTINA0907@users.noreply.github.com>
Co-authored-by: easywaytolifebelief's avatarqizhaoaoe <2285142981@qq.com>
Co-authored-by: NliuTINA0907 <65896652+liuTINA0907@users.noreply.github.com>
上级 5f141572
import gradio as gr
from predictor import Predictor
# Locations of the exported inference model, its parameters and the label
# list that model_inference() hands to Predictor. The "paddlecv://" scheme is
# not a real protocol; the download helpers in this commit rewrite it to an
# HTTPS prefix before fetching.
model_path = "paddlecv://models/vit/v2.4/imagenet2012-ViT-B_16-224_infer.pdmodel"
params_path = "paddlecv://models/vit/v2.4/imagenet2012-ViT-B_16-224_infer.pdiparams"
label_path = "paddlecv://dataset/imagenet2012_labels.txt"
# Shared Predictor instance; stays None until model_inference() first runs.
predictor = None
def model_inference(image):
    """Classify ``image`` and return it together with the prediction summary.

    The module-level ``predictor`` is created on the first call and reused
    on every later call.

    Args:
        image: input handed over by the Gradio image component.

    Returns:
        tuple: ``(image, result)`` where ``result`` is a dict holding the
        "scores" and "labels" lists produced by ``predictor.predict``.
    """
    global predictor

    # Build the predictor lazily so the model is only loaded when needed.
    if predictor is None:
        predictor = Predictor(
            model_path=model_path,
            params_path=params_path,
            label_path=label_path)

    top_scores, top_labels = predictor.predict(image)
    result = {
        "scores": top_scores.tolist(),
        "labels": top_labels.tolist(),
    }
    return image, result
def clear_all():
    """Return one ``None`` per component wired to the Clear button."""
    return (None,) * 3
# Gradio page for the ViT classification demo: an input image, Clear/Submit
# buttons, an output image and a JSON panel for the predictions.
# NOTE(review): block nesting is not visible in this copy of the file; which
# components belong to the Column and the Row must be confirmed against the
# original source.
with gr.Blocks() as demo:
gr.Markdown("Classification based on ViT")
with gr.Column(scale=1, min_width=100):
# Input image, pre-filled with a sample picture URL.
img_in = gr.Image(
value="https://plsc.bj.bcebos.com/dataset/test_images/cat.jpg",
label="Input")
with gr.Row():
btn1 = gr.Button("Clear")
btn2 = gr.Button("Submit")
img_out = gr.Image(label="Output")
json_out = gr.JSON(label="jsonOutput")
# Submit: run the classifier on the input image and fill both outputs.
btn2.click(fn=model_inference, inputs=img_in, outputs=[img_out, json_out])
# Clear: reset the input image, the output image and the JSON panel.
btn1.click(fn=clear_all, inputs=None, outputs=[img_in, img_out, json_out])
# NOTE(review): this calls Button.style on the class with the integer 1 in the
# position of `self`, so it is not applied to btn1 or btn2. Purpose unclear;
# confirm whether the call can be dropped.
gr.Button.style(1)
# Start the web server for the demo.
demo.launch()
【PLSC-ViT-App-YAML】
APP_Info:
title: PLSC-ViT-App
colorFrom: blue
colorTo: yellow
sdk: gradio
sdk_version: 3.9.1
app_file: app.py
license: apache-2.0
device: cpu
\ No newline at end of file
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import os.path as osp
import sys
import yaml
import time
import shutil
import requests
import tqdm
import hashlib
import base64
import binascii
import tarfile
import zipfile
# Public helpers: each one resolves a URL to a locally cached file.
__all__ = [
'get_model_path',
'get_config_path',
'get_dict_path',
'get_data_path',
]
# Per-user cache directories, one per kind of downloaded artifact.
WEIGHTS_HOME = osp.expanduser("~/.cache/paddlecv/models/plsc")
CONFIGS_HOME = osp.expanduser("~/.cache/paddlecv/configs/plsc")
DICTS_HOME = osp.expanduser("~/.cache/paddlecv/dicts/plsc/")
DATA_HOME = osp.expanduser("~/.cache/paddlecv/dataset/plsc")
# NOTE(review): the two comment lines below describe a dataset dict that is
# not defined anywhere in the visible code; they look like leftovers.
# dict of {dataset_name: (download_info, sub_dirs)}
# download info: [(url, md5sum)]
# Maximum number of download attempts made by _download() before it raises.
DOWNLOAD_RETRY_LIMIT = 3
# Real URL prefix substituted for the "paddlecv://" scheme by parse_url().
PMP_DOWNLOAD_URL_PREFIX = 'https://plsc.bj.bcebos.com/'
def is_url(path):
    """
    Tell whether ``path`` names a remote resource instead of a local file.

    Args:
        path (string): candidate string to inspect.

    Returns:
        bool: True when ``path`` starts with ``http://``, ``https://`` or
        ``paddlecv://``.
    """
    return path.startswith(('http://', 'https://', 'paddlecv://'))
def parse_url(url):
    """Expand the ``paddlecv://`` shorthand into a downloadable URL.

    Every occurrence of ``paddlecv://`` in ``url`` is replaced with
    ``PMP_DOWNLOAD_URL_PREFIX``; strings without it come back unchanged.
    """
    return url.replace("paddlecv://", PMP_DOWNLOAD_URL_PREFIX)
def get_model_path(path):
    """Resolve a model file, downloading it when ``path`` is a URL.

    A non-URL ``path`` is returned untouched. A URL is cached below
    WEIGHTS_HOME, keeping the last three components of the URL path.
    """
    if is_url(path):
        local_path, _ = get_path(parse_url(path), WEIGHTS_HOME, path_depth=3)
        return local_path
    return path
def get_data_path(path):
    """Resolve a data file, downloading it when ``path`` is a URL.

    A non-URL ``path`` is returned untouched. A URL is cached below
    DATA_HOME under the file name taken from the URL.
    """
    if is_url(path):
        local_path, _ = get_path(parse_url(path), DATA_HOME, path_depth=1)
        return local_path
    return path
def get_config_path(path):
    """Resolve a config file, downloading it when ``path`` is a URL.

    A non-URL ``path`` is returned untouched. A URL is cached below
    CONFIGS_HOME under the file name taken from the URL.
    """
    if is_url(path):
        local_path, _ = get_path(parse_url(path), CONFIGS_HOME)
        return local_path
    return path
def get_dict_path(path):
    """Resolve a dictionary file, downloading it when ``path`` is a URL.

    A non-URL ``path`` is returned untouched. A URL is cached below
    DICTS_HOME under the file name taken from the URL.
    """
    if is_url(path):
        local_path, _ = get_path(parse_url(path), DICTS_HOME)
        return local_path
    return path
def map_path(url, root_dir, path_depth=1):
    """Map a download URL onto a location below ``root_dir``.

    Args:
        url (str): download URL.
        root_dir (str): directory the file is stored under.
        path_depth (int): how many trailing components of ``url`` are kept
            in the local path; must be positive.

    Returns:
        tuple: ``(path, dirname)``, the local file path and its directory.
    """
    assert path_depth > 0, "path_depth should be a positive integer"

    # Walk up ``path_depth`` levels to find the leading part of the URL
    # that is discarded.
    dropped_prefix = url
    for _ in range(path_depth):
        dropped_prefix = osp.dirname(dropped_prefix)

    local_path = osp.join(root_dir, osp.relpath(url, dropped_prefix))
    return local_path, osp.dirname(local_path)
def get_path(url, root_dir, md5sum=None, check_exist=True, path_depth=1):
    """Return the local path for ``url``, downloading it when necessary.

    Args:
        url (str): download url.
        root_dir (str): directory the download is stored under.
        md5sum (str): expected md5 sum of the downloaded file, if known.
        check_exist (bool): when True, an existing local copy may be reused.
        path_depth (int): number of trailing URL components kept locally.

    Returns:
        tuple: ``(fullpath, reused)`` where ``reused`` is True when an
        existing local copy was returned and False after a download.
    """
    fullpath, dirname = map_path(url, root_dir, path_depth)

    if osp.exists(fullpath) and check_exist:
        # A directory is accepted as is; a regular file must pass the md5
        # check, otherwise it is deleted and fetched again.
        reusable = (not osp.isfile(fullpath)
                    or _check_exist_file_md5(fullpath, md5sum, url))
        if reusable:
            return fullpath, True
        os.remove(fullpath)

    _download(url, dirname, md5sum)
    return fullpath, False
def _download(url, path, md5sum=None):
    """
    Download from url, save to path.

    The file is fetched again until it exists locally and passes
    ``_check_exist_file_md5``, for at most ``DOWNLOAD_RETRY_LIMIT`` attempts.

    Args:
        url (str): download url
        path (str): directory the file is downloaded into
        md5sum (str): expected md5 sum of the file, or None

    Returns:
        str: full path of the downloaded file.

    Raises:
        RuntimeError: when the server answers with a status other than 200,
            or when the retry limit is reached.
    """
    # exist_ok avoids failing when another process creates the directory
    # between an existence check and the creation.
    os.makedirs(path, exist_ok=True)

    fname = osp.split(url)[-1]
    fullname = osp.join(path, fname)
    retry_cnt = 0

    while not (osp.exists(fullname) and
               _check_exist_file_md5(fullname, md5sum, url)):
        if retry_cnt < DOWNLOAD_RETRY_LIMIT:
            retry_cnt += 1
        else:
            raise RuntimeError("Download from {} failed. "
                               "Retry limit reached".format(url))

        # NOTE: windows path join may incur \, which is invalid in url
        if sys.platform == "win32":
            url = url.replace('\\', '/')

        req = requests.get(url, stream=True)
        try:
            if req.status_code != 200:
                raise RuntimeError("Downloading from {} failed with code "
                                   "{}!".format(url, req.status_code))

            # To guard against an interrupted download, write to
            # tmp_fullname first and move it to fullname only after the
            # download has finished.
            tmp_fullname = fullname + "_tmp"
            total_size = req.headers.get('content-length')
            with open(tmp_fullname, 'wb') as f:
                if total_size:
                    # Size is known: show a progress bar counted in KB.
                    for chunk in tqdm.tqdm(
                            req.iter_content(chunk_size=1024),
                            total=(int(total_size) + 1023) // 1024,
                            unit='KB'):
                        f.write(chunk)
                else:
                    for chunk in req.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
        finally:
            # Always release the streamed connection, including on the
            # error paths above.
            req.close()

        shutil.move(tmp_fullname, fullname)

    return fullname
def _check_exist_file_md5(filename, md5sum, url):
    """Validate an existing file against an md5 sum.

    When ``md5sum`` is None and ``filename`` ends with 'pdparams', the
    expected value is read from the response headers of ``url``; in every
    other case ``md5sum`` is checked directly.
    """
    use_remote_md5 = md5sum is None and filename.endswith('pdparams')
    if use_remote_md5:
        return _md5check_from_url(filename, url)
    return _md5check(filename, md5sum)
def _md5check_from_url(filename, url):
    """Check ``filename`` against the md5 advertised by the server for ``url``.

    The 'content-md5' response header is read, base64-decoded and compared
    as a hex digest. Returns True when the header is missing or empty, or
    when the digest matches; False otherwise.
    """
    response = requests.get(url, stream=True)
    content_md5 = response.headers.get('content-md5')
    response.close()

    if not content_md5:
        # The server gave no checksum, so there is nothing to compare with.
        return True

    expected_hex = binascii.hexlify(
        base64.b64decode(content_md5.strip('"'))).decode()
    return _md5check(filename, expected_hex)
def _md5check(fullname, md5sum=None):
    """Compare the md5 hex digest of file ``fullname`` with ``md5sum``.

    Returns True when ``md5sum`` is None (nothing to verify) or when the
    digests are equal, False otherwise.
    """
    if md5sum is None:
        return True

    digest = hashlib.md5()
    with open(fullname, 'rb') as stream:
        # Hash in 4 KB pieces so the whole file never sits in memory.
        piece = stream.read(4096)
        while piece != b"":
            digest.update(piece)
            piece = stream.read(4096)

    return digest.hexdigest() == md5sum
import os
import cv2
import numpy as np
import paddle
from download import get_model_path, get_data_path
class Predictor(object):
    """Image classifier built on a Paddle Inference predictor.

    The instance loads an exported model and a label list, and ``predict``
    runs preprocessing, inference and postprocessing to return the five
    highest-scoring classes.
    """

    def __init__(self,
                 model_type="paddle",
                 model_path=None,
                 params_path=None,
                 label_path=None):
        '''
        Load the inference model and the label list.

        model_type: str, only "paddle" is accepted
        model_path: str, location of the '.pdmodel' file; resolved through
            get_model_path, so a URL is downloaded first
        params_path: str, location of the '.pdiparams' file; resolved
            through get_model_path, so a URL is downloaded first
        label_path: str, location of the label file; resolved through
            get_data_path
        '''
        assert model_type in ["paddle"]
        assert model_path is not None and os.path.splitext(model_path)[
            1] == '.pdmodel'
        assert params_path is not None and os.path.splitext(params_path)[
            1] == '.pdiparams'

        # Imported here so the inference API is only needed once a
        # predictor is actually created.
        import paddle.inference as paddle_infer

        infer_model = get_model_path(model_path)
        infer_params = get_model_path(params_path)
        config = paddle_infer.Config(infer_model, infer_params)
        self.predictor = paddle_infer.create_predictor(config)

        self.input_names = self.predictor.get_input_names()
        self.output_names = self.predictor.get_output_names()
        self.labels = self.parse_labes(get_data_path(label_path))
        self.model_type = model_type

    def predict(self, img):
        """Run the full pipeline on ``img`` and return the postprocessed result.

        With the default ``postprocess`` the result is a tuple
        ``(scores, labels)`` holding the five best classes.
        """
        if self.preprocess is not None:
            inputs = self.preprocess(img)
        else:
            inputs = img

        # Feed every model input from the dict produced by preprocessing.
        # NOTE(review): preprocess() only provides the key 'x'; presumably
        # the exported model has a single input with that name - confirm.
        for input_name in self.input_names:
            input_tensor = self.predictor.get_input_handle(input_name)
            input_tensor.copy_from_cpu(inputs[input_name])

        self.predictor.run()

        # Copy every output tensor back to host memory, in declared order.
        outputs = [
            self.predictor.get_output_handle(output_name).copy_to_cpu()
            for output_name in self.output_names
        ]

        if self.postprocess is not None:
            output_data = self.postprocess(outputs)
        else:
            output_data = outputs

        return output_data

    def preprocess(self, img):
        """Resize and normalise ``img`` into the model input dict.

        The image is resized to 224x224, scaled by 1/255, then shifted and
        scaled with mean 0.5 and std 0.5. A leading axis is added and the
        last axis is moved to position 1.
        Assumes ``img`` is a 3-D array whose last axis is the channel axis -
        TODO confirm against the caller.

        Returns:
            dict: ``{'x': array}`` with the prepared float32 array.
        """
        img = cv2.resize(img, (224, 224))

        scale = 1.0 / 255.0
        mean = 0.5
        std = 0.5
        img = (img.astype('float32') * scale - mean) / std

        img = img[np.newaxis, :, :, :]
        img = img.transpose((0, 3, 1, 2))
        return {'x': img}

    @staticmethod
    def parse_labes(label_path):
        """Read class names from the text file ``label_path``.

        Lines shorter than two characters are skipped. For every other
        line, the text before the first comma is split on single spaces
        and the third field is used as the label.
        Presumably lines look like '<index> <id> <name>, ...' - verify
        against the label file.

        Returns:
            list: one label string per accepted line, in file order.
        """
        with open(label_path, 'r') as f:
            labels = []
            for line in f:
                if len(line) < 2:
                    continue
                label = line.strip().split(',')[0].split(' ')[2]
                labels.append(label)
        return labels

    @staticmethod
    def softmax(x, epsilon=1e-6):
        """Return the softmax of ``x`` taken over all of its elements.

        The maximum is subtracted before exponentiation so large logits
        cannot overflow to ``inf`` and produce NaN. ``epsilon`` is added to
        numerator and denominator, so the result does not sum exactly to 1.
        """
        x = np.asarray(x)
        # An empty input has no maximum; fall back to no shift.
        shift = np.max(x) if x.size else 0.0
        exp_x = np.exp(x - shift)
        sfm = (exp_x + epsilon) / (np.sum(exp_x) + epsilon)
        return sfm

    def postprocess(self, logits):
        """Turn raw model outputs into the five best (score, label) pairs.

        Args:
            logits: list of output arrays collected by ``predict``.
                Assumes squeezing leaves a 1-D vector with one entry per
                label - TODO confirm.

        Returns:
            tuple: ``(scores, labels)`` arrays sorted by descending score.
        """
        pred = np.array(logits).squeeze()
        pred = self.softmax(pred)
        class_idx = pred.argsort()[::-1]
        return pred[class_idx[:5]], np.array(self.labels)[class_idx[:5]]
gradio
opencv-python
paddlepaddle
PyYAML
shapely
scipy
Cython
numpy
setuptools
pillow
tqdm
\ No newline at end of file
# 1. 推理Benchmark
## 1.1 软硬件环境
- PLSC-ViT模型推理采用GPU的型号为A100,不同的尺度的模型采用了单机8卡或是4机32卡。
## 1.2 数据集
- 测试使用的数据集为ImageNet.
## 1.3 指标
| Model | Phase | Dataset | gpu | img/sec | Top1 Acc | Official |
| --- | --- | --- | --- | --- | --- | --- |
| ViT-B_16_224 |pretrain |ImageNet2012 |A100*N1C8 | 3583| 0.75196 | 0.7479 |
| ViT-B_16_384 |finetune | ImageNet2012 | A100*N1C8 | 719 | 0.77972 | 0.7791 |
| ViT-L_16_224 | pretrain | ImageNet21K | A100*N4C32 | 5256 | - | - |
|ViT-L_16_384 |finetune | ImageNet2012 | A100*N4C32 | 934 | 0.85030 | 0.8505 |
# 2. 相关使用说明
https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/vit/README.md
# 1. Benchmark
## 1.1 Environment
- We train the ViT on 1 node with 8 A100 gpus or 4 nodes with 32 A100 gpus.
## 1.2 DataSet
- We train the ViT on ImageNet.
## 1.3 Benchmark
| Model | Phase | Dataset | gpu | img/sec | Top1 Acc | Official |
| --- | --- | --- | --- | --- | --- | --- |
| ViT-B_16_224 |pretrain |ImageNet2012 |A100*N1C8 | 3583| 0.75196 | 0.7479 |
| ViT-B_16_384 |finetune | ImageNet2012 | A100*N1C8 | 719 | 0.77972 | 0.7791 |
| ViT-L_16_224 | pretrain | ImageNet21K | A100*N4C32 | 5256 | - | - |
|ViT-L_16_384 |finetune | ImageNet2012 | A100*N4C32 | 934 | 0.85030 | 0.8505 |
# 2. Reference
https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/vit/README.md
# 模型列表
|模型名称|模型简介|模型配置|预训练checkpoint下载地址|
| --- | --- | --- | --- |
| ViT-B_16_224 |输入size为224,layers=12|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml) |[download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet2012-ViT-B_16-224.pdparams) |
| ViT-B_16_384 |输入size为384,layers=12|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_base_patch16_384_ft_in1k_1n8c_dp_fp16o2.yaml)| [download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet2012-ViT-B_16-224.pdparams) |
| ViT-L_16_224 |输入size为224,layers=24|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_large_patch16_224_in21k_4n32c_dp_fp16o2.yaml)| [download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet21k-ViT-L_16-224.pdparams) |
| ViT-L_16_384 |输入size为384,layers=24|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_large_patch16_384_in1k_ft_4n32c_dp_fp16o2.yaml) | [download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet21k-ViT-L_16-224.pdparams) |
# Model List
|Model Name|Introduction|Config|Pretrained checkpoint Download|
| --- | --- | --- | --- |
| ViT-B_16_224 |input_size=224,layers=12|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml) |[download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet2012-ViT-B_16-224.pdparams) |
| ViT-B_16_384 |input_size=384,layers=12|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_base_patch16_384_ft_in1k_1n8c_dp_fp16o2.yaml)| [download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet2012-ViT-B_16-224.pdparams) |
| ViT-L_16_224 |input_size=224,layers=24|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_large_patch16_224_in21k_4n32c_dp_fp16o2.yaml)| [download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet21k-ViT-L_16-224.pdparams) |
| ViT-L_16_384 |input_size=384,layers=24|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_large_patch16_384_in1k_ft_4n32c_dp_fp16o2.yaml) | [download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet21k-ViT-L_16-224.pdparams) |
---
Model_Info:
name: "PLSC-ViT"
description: "PaddlePaddle 重新实现 Google 官方 Repo 中的 Vision Transformer 算法 《An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale》"
description_en: "PaddlePaddle reimplementation of Google's repository for the Vision Transformer model that was released with the paper An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale."
update_time:
icon: "https://plsc.bj.bcebos.com/assets/modelcenter-icon.png"
from_repo: "PLSC"
Task:
- tag: 计算机视觉
tag_en: Computer Vision
sub_tag: 图像分类
sub_tag_en: Image Classification
Example:
- tag:
tag_en:
sub_tag:
sub_tag_en:
title:
title_en:
url:
url_en:
Datasets: ImageNet 1K, ImageNet 21K
Publisher: Baidu
License: Apache 2.0
Paper:
- title: "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale"
url: https://arxiv.org/abs/2010.11929
IfTraining: 1
IfOnlineDemo: 1
{
"cells": [
{
"cell_type": "markdown",
"id": "ae69ce68",
"metadata": {},
"source": [
"## 1. PLSC-ViT模型简介\n"
]
},
{
"cell_type": "markdown",
"id": "35485bc6",
"metadata": {},
"source": [
"PLSC-ViT实现了基于Transformer的视觉分类模型。ViT将图像切分成patch,之后基于patch拉平的sequence进行线性embedding,并且添加了position embeddings和classification token,然后将patch序列输入到标准的transformer编码器,最终经过一个MLP进行分类。模型结构如下:\n",
"\n",
"![Figure 1 from paper](https://github.com/google-research/vision_transformer/raw/main/vit_figure.png)\n"
]
},
{
"cell_type": "markdown",
"id": "97e174e6",
"metadata": {},
"source": [
"## 2. 模型效果 "
]
},
{
"cell_type": "markdown",
"id": "78137a72",
"metadata": {},
"source": [
"| Model | Phase | Dataset | gpu | img/sec | Top1 Acc | Official |\n",
"| --- | --- | --- | --- | --- | --- | --- |\n",
"| ViT-B_16_224 |pretrain |ImageNet2012 |A100*N1C8 | 3583| 0.75196 | 0.7479 |\n",
"| ViT-B_16_384 |finetune | ImageNet2012 | A100*N1C8 | 719 | 0.77972 | 0.7791 |\n",
"| ViT-L_16_224 | pretrain | ImageNet21K | A100*N4C32 | 5256 | - | - |\n",
"|ViT-L_16_384 |finetune | ImageNet2012 | A100*N4C32 | 934 | 0.85030 | 0.8505 |"
]
},
{
"cell_type": "markdown",
"id": "ace3c48d",
"metadata": {},
"source": [
"## 3. 模型如何使用"
]
},
{
"cell_type": "markdown",
"id": "a97a5f56",
"metadata": {},
"source": [
"### 3.1 安装PLSC"
]
},
{
"cell_type": "markdown",
"id": "492fa769-2fe0-4220-b6d9-bbc32f8cca10",
"metadata": {},
"source": [
"```\n",
"git clone https://github.com/PaddlePaddle/PLSC.git\n",
"cd /path/to/PLSC/\n",
"# [optional] pip install -r requirements.txt\n",
"python setup.py develop\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "6b22824d",
"metadata": {},
"source": [
"### 3.2 模型训练"
]
},
{
"cell_type": "markdown",
"id": "d68ca5fb",
"metadata": {},
"source": [
"1. 进入任务目录\n",
"\n",
"```\n",
"cd task/classification/vit\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "9048df01",
"metadata": {},
"source": [
"2. 准备数据\n",
"\n",
"将数据整理成以下格式:\n",
"```text\n",
"dataset/\n",
"└── ILSVRC2012\n",
" ├── train\n",
" ├── val\n",
" ├── train_list.txt\n",
" └── val_list.txt\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "bea743ea",
"metadata": {},
"source": [
"3. 执行训练命令\n",
"\n",
"```shell\n",
"export PADDLE_NNODES=1\n",
"export PADDLE_MASTER=\"127.0.0.1:12538\"\n",
"export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n",
"\n",
"python -m paddle.distributed.launch \\\n",
" --nnodes=$PADDLE_NNODES \\\n",
" --master=$PADDLE_MASTER \\\n",
" --devices=$CUDA_VISIBLE_DEVICES \\\n",
" plsc-train \\\n",
" -c ./configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml\n",
"```\n",
"\n",
"更多模型的训练教程可参考文档:[ViT训练文档](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/vit/README.md)"
]
},
{
"cell_type": "markdown",
"id": "186a0c17",
"metadata": {},
"source": [
"### 3.3 模型推理"
]
},
{
"cell_type": "markdown",
"id": "e97c527c",
"metadata": {},
"source": [
"1. 下载预训练模型\n",
"\n",
"```shell\n",
"mkdir -p pretrained/vit/ViT_base_patch16_224/\n",
"wget -O ./pretrained/vit/ViT_base_patch16_224/imagenet2012-ViT-B_16-224.pdparams https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet2012-ViT-B_16-224.pdparams\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "a07c6549",
"metadata": {},
"source": [
"2. 导出推理模型\n",
"\n",
"```shell\n",
"plsc-export -c ./configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml -o Global.pretrained_model=./pretrained/vit/ViT_base_patch16_224/imagenet2012-ViT-B_16-224 -o Model.data_format=NCHW -o FP16.level=O0\n",
"```\n"
]
},
{
"cell_type": "markdown",
"id": "d375934d",
"metadata": {},
"source": [
"## 4. 相关论文及引用信息\n"
]
},
{
"cell_type": "markdown",
"id": "29f05b07-d323-45e4-b00d-0728eafb5af7",
"metadata": {},
"source": [
"```text\n",
"@article{dosovitskiy2020,\n",
" title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},\n",
" author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil},\n",
" journal={arXiv preprint arXiv:2010.11929},\n",
" year={2020}\n",
"}\n",
"```"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
{
"cells": [
{
"cell_type": "markdown",
"id": "ae69ce68",
"metadata": {},
"source": [
"## 1. PLSC-ViT Introduction\n"
]
},
{
"cell_type": "markdown",
"id": "35485bc6",
"metadata": {},
"source": [
"PLSC-ViT reimplements Google's repository for the ViT model. An overview of the model is as follows. The input image is split into fixed-size patches, each patch is linearly embedded, and position embeddings are added. The resulting sequence is fed into a standard Transformer encoder. To perform classification, the standard approach of adding an extra learnable \"classification token\" to the sequence is used.\n",
"\n",
"![Figure 1 from paper](https://github.com/google-research/vision_transformer/raw/main/vit_figure.png)\n"
]
},
{
"cell_type": "markdown",
"id": "97e174e6",
"metadata": {},
"source": [
"## 2. Model Effects and Application Scenarios"
]
},
{
"cell_type": "markdown",
"id": "67ae978f",
"metadata": {},
"source": [
"| Model | Phase | Dataset | gpu | img/sec | Top1 Acc | Official |\n",
"| --- | --- | --- | --- | --- | --- | --- |\n",
"| ViT-B_16_224 |pretrain |ImageNet2012 |A100*N1C8 | 3583| 0.75196 | 0.7479 |\n",
"| ViT-B_16_384 |finetune | ImageNet2012 | A100*N1C8 | 719 | 0.77972 | 0.7791 |\n",
"| ViT-L_16_224 | pretrain | ImageNet21K | A100*N4C32 | 5256 | - | - |\n",
"|ViT-L_16_384 |finetune | ImageNet2012 | A100*N4C32 | 934 | 0.85030 | 0.8505 |"
]
},
{
"cell_type": "markdown",
"id": "ace3c48d",
"metadata": {},
"source": [
"## 3. How to use the Model"
]
},
{
"cell_type": "markdown",
"id": "186a0c17",
"metadata": {},
"source": [
"### 3.1 Install PLSC\n",
"\n",
"```shell\n",
"git clone https://github.com/PaddlePaddle/PLSC.git\n",
"cd /path/to/PLSC/\n",
"# [optional] pip install -r requirements.txt\n",
"python setup.py develop\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "6b22824d",
"metadata": {},
"source": [
"### 3.2 Model Training"
]
},
{
"cell_type": "markdown",
"id": "a562bf23",
"metadata": {},
"source": [
"1. Enter into the task directory\n",
"\n",
"```shell\n",
"cd task/classification/vit\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "de109245",
"metadata": {},
"source": [
"2. Prepare the data\n",
"\n",
"Organize the data into the following format:\n",
"\n",
"```text\n",
"dataset/\n",
"└── ILSVRC2012\n",
" ├── train\n",
" ├── val\n",
" ├── train_list.txt\n",
" └── val_list.txt\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "ec78efdf",
"metadata": {},
"source": [
"3. Run the command\n",
"\n",
"```shell\n",
"export PADDLE_NNODES=1\n",
"export PADDLE_MASTER=\"127.0.0.1:12538\"\n",
"export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n",
"\n",
"python -m paddle.distributed.launch \\\n",
" --nnodes=$PADDLE_NNODES \\\n",
" --master=$PADDLE_MASTER \\\n",
" --devices=$CUDA_VISIBLE_DEVICES \\\n",
" plsc-train \\\n",
" -c ./configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml\n",
"```\n",
"\n",
"More model training tutorials are available in the [ViT README](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/vit/README.md)."
]
},
{
"cell_type": "markdown",
"id": "05ba38c3",
"metadata": {},
"source": [
"### 3.3 Model Inference"
]
},
{
"cell_type": "markdown",
"id": "7a3ce1ab",
"metadata": {},
"source": [
"1. Download pretrained model\n",
"\n",
"```shell\n",
"mkdir -p pretrained/vit/ViT_base_patch16_224/\n",
"wget -O ./pretrained/vit/ViT_base_patch16_224/imagenet2012-ViT-B_16-224.pdparams https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet2012-ViT-B_16-224.pdparams\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "cff5ac83",
"metadata": {},
"source": [
"2. Export model for inference\n",
"\n",
"```shell\n",
"plsc-export -c ./configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml -o Global.pretrained_model=./pretrained/vit/ViT_base_patch16_224/imagenet2012-ViT-B_16-224 -o Model.data_format=NCHW -o FP16.level=O0\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "d375934d",
"metadata": {},
"source": [
"## 4. Related papers and citations\n",
"\n",
"```text\n",
"@article{dosovitskiy2020,\n",
" title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},\n",
" author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil},\n",
" journal={arXiv preprint arXiv:2010.11929},\n",
" year={2020}\n",
"}\n",
"```"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册