diff --git a/demo/finetune_mrc_dygraph.py b/demo/finetune_mrc_dygraph.py
index de0668d5b33cfbfbba14c0d06dbe37f468f69fa4..13d1e8b794857ea440c45de72bef7113340c3d66 100644
--- a/demo/finetune_mrc_dygraph.py
+++ b/demo/finetune_mrc_dygraph.py
@@ -82,8 +82,8 @@ def train(model, train_dataset, dev_dataset, dev_examples, dev_features, tokeniz
     model = D.parallel.DataParallel(model, ctx)
 
     max_steps = len(train_features) * args.epoch // args.bsz
-    opt = AdamW(learning_rate=args.lr, parameter_list=model.parameters(), weight_decay=args.wd)
     g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
+    opt = AdamW(learning_rate=args.lr, parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip)
 
     train_dataset = train_dataset \
             .repeat() \
@@ -97,7 +97,7 @@ def train(model, train_dataset, dev_dataset, dev_examples, dev_features, tokeniz
             scaled_loss = model.scale_loss(loss)
             scaled_loss.backward()
             model.apply_collective_grads()
-            opt.minimize(scaled_loss, grad_clip=g_clip)
+            opt.minimize(scaled_loss)
             model.clear_gradients()
             if D.parallel.Env().dev_id == 0 and step % 10 == 0:
                 log.debug('[step %d] train loss %.5f lr %.3e' % (step, loss.numpy(), opt.current_step_lr()))
diff --git a/demo/finetune_ner_dygraph.py b/demo/finetune_ner_dygraph.py
index cf1615a66ba6297486ba41d2ac35c0709480a3c0..4395d6ac046973d2dadca2d1aa56e9bd4dddd8ca 100644
--- a/demo/finetune_ner_dygraph.py
+++ b/demo/finetune_ner_dygraph.py
@@ -26,7 +26,6 @@ from functools import reduce, partial
 import numpy as np
 import multiprocessing
 import pickle
-import jieba
 import logging
 
 from sklearn.metrics import f1_score
diff --git a/demo/finetune_sentiment_analysis_dygraph.py b/demo/finetune_sentiment_analysis_dygraph.py
index 17ea38483d7c327b3296b42cb3e6317d03f95289..7a01899b1679d59e11eae40b5d67f80e0d7a51e8 100644
--- a/demo/finetune_sentiment_analysis_dygraph.py
+++ b/demo/finetune_sentiment_analysis_dygraph.py
@@ -95,12 +95,14 @@ if __name__ == '__main__':
     dev_ds.data_shapes = shapes
     dev_ds.data_types = types
 
+    g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
     opt = AdamW(learning_rate=LinearDecay(
                 args.lr,
                 int(args.warmup_proportion * args.max_steps), args.max_steps),
             parameter_list=model.parameters(),
-            weight_decay=args.wd)
-    g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
+            weight_decay=args.wd,
+            grad_clip=g_clip)
+
     for epoch in range(args.epoch):
         for step, d in enumerate(tqdm(train_ds.start(place), desc='training')):
             ids, sids, label = d
@@ -108,7 +110,7 @@ if __name__ == '__main__':
             loss.backward()
             if step % 10 == 0:
                 log.debug('train loss %.5f lr %.3e' % (loss.numpy(), opt.current_step_lr()))
-            opt.minimize(loss, grad_clip=g_clip)
+            opt.minimize(loss)
             model.clear_gradients()
             if step % 100 == 0:
                 acc = []
diff --git a/distill/README.md b/distill/README.md
index 586ac842fdce998f7980062dc4dbc820c2f55f43..78c3e734ac14f6d4c17a576c0aea3c35f191d7d9 100644
--- a/distill/README.md
+++ b/distill/README.md
@@ -5,7 +5,6 @@
 * [Evaluation](#效果验证)
 * [Case#1: the user provides "unlabeled data"](#case1)
 * [Case#2: the user provides no "unlabeled data"](#case2)
-* [FAQ](#faq)
 
 # ERNIE Slim Data Distillation
 Behind ERNIE's strong semantic understanding lies the equally heavy compute needed to train and serve a model of this scale. Many industrial scenarios have strict performance requirements, and without effective compression the model cannot be deployed in practice.
@@ -37,7 +36,7 @@
 # Usage
 
-Using the three augmentation strategies above, we built an augmented chnsenticorp dataset 10x the size of the original training data (96,000 lines); download it [here](https://ernie.bj.bcebos.com/distill_data.tar.gz), then run the script below to start distillation.
+Using the three augmentation strategies above, we built an augmented chnsenticorp dataset 10x the size of the original training data (96,000 lines); download it [here](https://ernie-github.cdn.bcebos.com/data-chnsenticorp-distill.tar.gz), then run the script below to start distillation.
 
 ```shell
 python ./distill/distill.py
 ```
@@ -64,8 +63,3 @@ python ./distill/distill.py
 |Non-ERNIE baseline (LSTM)|91.2%|
 |**+ data distillation**|93.9%|
 
-# FAQ
-
-### FAQ 1: distilling while serving predictions fails with `Client call failed`
-
-The error printed in the terminal is the client-side log; the server-side log appears earlier. The usual cause is the server exceeding GPU memory. In that case, pass `--server_batch_size ` in the student finetuning script to explicitly control the batch size of requests sent to the server.
diff --git a/distill/distill.py b/distill/distill.py
index 69ad7617cbc9911559265b7890175e4c392d4356..534d8955a4fda32fc374c8e392b1f4b8719d6a12 100644
--- a/distill/distill.py
+++ b/distill/distill.py
@@ -30,12 +30,13 @@ from ernie.optimization import AdamW, LinearDecay
 
 # This example uses the chnsenticorp Chinese sentiment classification task as a demo; the unsupervised data needed for distillation was prepared beforehand via data augmentation
 #
-# Please download the data from "" and place it under ./chnsenticorp-data/
+# Download the data and place it under ./chnsenticorp-data/
 # The data has 3 columns: raw text; space-tokenized text; sentiment label
 # The first column is the input to ERNIE; the second column is the input to the BoW (bag-of-words) model
 # The precomputed BoW vocabulary is at ./chnsenticorp-data/vocab.bow.txt
 
 # Hyperparameters for finetuning the teacher model
+DATA_DIR='./chnsenticorp-data/'
 SEQLEN=256
 BATCH=32
 EPOCH=10
@@ -43,7 +44,7 @@ LR=5e-5
 
 tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
 
-student_vocab = {i.strip(): l for l, i in enumerate(open('./chnsenticorp-data/vocab.bow.txt').readlines())}
+student_vocab = {i.strip(): l for l, i in enumerate(open(os.path.join(DATA_DIR, 'vocab.bow.txt')).readlines())}
 
 def space_tokenizer(i):
     return i.decode('utf8').split()
@@ -63,11 +64,17 @@ def map_fn(seg_a, seg_a_student, label):
     return seg_a_student, sentence, segments, label
 
-train_ds = feature_column.build_dataset('train', data_dir='./chnsenticorp-data/train/', shuffle=True, repeat=False, use_gz=False) .map(map_fn) .padded_batch(BATCH,)
+train_ds = feature_column.build_dataset('train', data_dir=os.path.join(DATA_DIR, 'train/'), shuffle=True, repeat=False, use_gz=False) \
+        .map(map_fn) \
+        .padded_batch(BATCH)
 
-train_ds_unlabel = feature_column.build_dataset('train-da', data_dir='./chnsenticorp-data/train-data-augmented/', shuffle=True, repeat=False, use_gz=False) .map(map_fn) .padded_batch(BATCH,)
+train_ds_unlabel = feature_column.build_dataset('train-da', data_dir=os.path.join(DATA_DIR, 'train-data-augmented/'), shuffle=True, repeat=False, use_gz=False) \
+        .map(map_fn) \
+        .padded_batch(BATCH)
 
-dev_ds = feature_column.build_dataset('dev', data_dir='./chnsenticorp-data/dev/', shuffle=False, repeat=False, use_gz=False) .map(map_fn) .padded_batch(BATCH,)
+dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(DATA_DIR, 'dev/'), shuffle=False, repeat=False, use_gz=False) \
+        .map(map_fn) \
+        .padded_batch(BATCH)
 
 shapes = ([-1,SEQLEN],[-1,SEQLEN], [-1, SEQLEN], [-1])
 types = ('int64', 'int64', 'int64', 'int64')
@@ -99,15 +106,15 @@ def evaluate_teacher(model, dataset):
 teacher_model = ErnieModelForSequenceClassification.from_pretrained('ernie-1.0', num_labels=2)
 teacher_model.train()
 if not os.path.exists('./teacher_model.pdparams'):
-    opt = AdamW(learning_rate=LinearDecay(LR, 9600*EPOCH*0.1/BATCH, 9600*EPOCH/BATCH), parameter_list=teacher_model.parameters(), weight_decay=0.01)
     g_clip = F.clip.GradientClipByGlobalNorm(1.0)
+    opt = AdamW(learning_rate=LinearDecay(LR, 9600*EPOCH*0.1/BATCH, 9600*EPOCH/BATCH), parameter_list=teacher_model.parameters(), weight_decay=0.01, grad_clip=g_clip)
     for epoch in range(EPOCH):
         for step, (ids_student, ids, sids, labels) in enumerate(train_ds.start(place)):
             loss, logits = teacher_model(ids, labels=labels)
             loss.backward()
             if step % 10 == 0:
                 print('[step %03d] teacher train loss %.5f lr %.3e' % (step, loss.numpy(), opt.current_step_lr()))
-            opt.minimize(loss, grad_clip=g_clip)
+            opt.minimize(loss)
             teacher_model.clear_gradients()
             if step % 100 == 0:
                 f1 = evaluate_teacher(teacher_model, dev_ds)
@@ -199,32 +206,34 @@ def KL(pred, target):
 teacher_model.eval()
 model = BOW()
 
-opt = AdamW(learning_rate=LR, parameter_list=model.parameters(), weight_decay=0.01)
 g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
+opt = AdamW(learning_rate=LR, parameter_list=model.parameters(), weight_decay=0.01, grad_clip=g_clip)
 
 model.train()
 for epoch in range(EPOCH):
-    for step, (ids_student, ids, sids, _ ) in enumerate(train_ds.start(place)):
+    for step, (ids_student, ids, sids, label) in enumerate(train_ds.start(place)):
         _, logits_t = teacher_model(ids, sids)  # logits from the teacher model
         logits_t.stop_gradient=True
         _, logits_s = model(ids_student)        # logits from the student model
-        loss = KL(logits_s, logits_t)     # KL divergence measures the distance between the two distributions
+        loss_ce, _ = model(ids_student, labels=label)
+        loss_kd = KL(logits_s, logits_t)  # KL divergence measures the distance between the two distributions
+        loss = loss_ce + loss_kd
         loss.backward()
         if step % 10 == 0:
-            print('[step %03d] unsupervised train loss %.5f lr %.3e' % (step, loss.numpy(), opt.current_step_lr()))
-        opt.minimize(loss, grad_clip=g_clip)
+            print('[step %03d] distill train loss %.5f lr %.3e' % (step, loss.numpy(), opt.current_step_lr()))
+        opt.minimize(loss)
         model.clear_gradients()
     f1 = evaluate_student(model, dev_ds)
-    print('f1 %.5f' % f1)
-
-    for step, (ids_student, ids, sids, label) in enumerate(train_ds.start(place)):
-        loss, _ = model(ids_student, labels=label)
-        loss.backward()
-        if step % 10 == 0:
-            print('[step %03d] supervised train loss %.5f lr %.3e' % (step, loss.numpy(), opt.current_step_lr()))
-        opt.minimize(loss, grad_clip=g_clip)
-        model.clear_gradients()
-
-    f1 = evaluate_student(model, dev_ds)
-    print('f1 %.5f' % f1)
-
+    print('student f1 %.5f' % f1)
+
+# Finally, run one more pass of hard-label training to consolidate the result
+for step, (ids_student, ids, sids, label) in enumerate(train_ds.start(place)):
+    loss, _ = model(ids_student, labels=label)
+    loss.backward()
+    if step % 10 == 0:
+        print('[step %03d] train loss %.5f lr %.3e' % (step, loss.numpy(), opt.current_step_lr()))
+    opt.minimize(loss)
+    model.clear_gradients()
+
+f1 = evaluate_student(model, dev_ds)
+print('final f1 %.5f' % f1)
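
Note on the new student objective: the patched loop in `distill/distill.py` now optimizes `loss_ce + loss_kd`, i.e. hard-label cross entropy plus the soft-label term from the `KL` helper defined elsewhere in that file (its body is not part of this patch). The following standalone numpy sketch only illustrates the shape of that combined objective; the names `soft_target_kl`, `hard_label_ce`, and `distill_loss` are illustrative and are not part of the repository, and details such as reduction or a temperature factor may differ from the repo's `KL`.

```python
# Illustrative sketch of a soft-label KL + hard-label CE distillation loss (numpy only).
import numpy as np

def _softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)          # subtract max for numerical stability
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def soft_target_kl(student_logits, teacher_logits):
    """KL(teacher || student), summed over classes and averaged over the batch."""
    p_t = _softmax(teacher_logits)                   # teacher distribution = soft labels
    log_p_s = np.log(_softmax(student_logits) + 1e-12)
    log_p_t = np.log(p_t + 1e-12)
    return np.mean(np.sum(p_t * (log_p_t - log_p_s), axis=-1))

def hard_label_ce(student_logits, labels):
    """Standard cross entropy of the student against the gold labels."""
    log_p_s = np.log(_softmax(student_logits) + 1e-12)
    return -np.mean(log_p_s[np.arange(len(labels)), labels])

def distill_loss(student_logits, teacher_logits, labels):
    # Same structure as `loss = loss_ce + loss_kd` in the patched training loop.
    return hard_label_ce(student_logits, labels) + soft_target_kl(student_logits, teacher_logits)

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    s = rng.randn(4, 2)                              # fake student logits: batch of 4, 2 classes
    t = rng.randn(4, 2)                              # fake teacher logits
    y = np.array([0, 1, 1, 0])                       # fake hard labels
    print('distill loss %.4f' % distill_loss(s, t, y))
```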