Commit 1087d6c6 authored by openeuler-ci-bot, committed by Gitee
obj-m := kvm_ept_idle.o
kvm_ept_idle-y := ept_idle.o ept_idle_native_pagewalk.o tlb_flush.o
include Kbuild
# Default to the running kernel's build tree; both variables can still be
# overridden from the make command line or the environment.
KERNEL_SRC_DIR ?= /lib/modules/$(shell uname -r)/build
MODULE_DIR ?= $(shell pwd)
default:
$(MAKE) -C $(KERNEL_SRC_DIR) M=$(MODULE_DIR) modules
clean:
rm -f *.o *.ko *.mod.c modules.order Module.symvers
# memory-scan
#### Description
A kernel module for scanning the page tables of processes and VMs.
#### Software Architecture
memory-scan is derived from the kernel module in memory-optimizer (https://github.com/intel/memory-optimizer/tree/master/kernel_module). It scans the page tables of processes and VMs to report the access state of their memory pages.
#### Installation
See the build and load steps under Instructions below.
#### Instructions
openEuler users can build and load memory-scan as follows:
1. make
2. insmod memory_scan.ko
#### Contribution
......
# memory-scan
#### Introduction
A kernel module for scanning the page tables of processes and VMs.
#### Software Architecture
memory-scan is derived from memory-optimizer (https://github.com/intel/memory-optimizer/tree/master/kernel_module). It is a kernel module that scans the access state of the memory pages of a process or VM.
#### Installation
See the build and load steps under Instructions below.
#### Instructions
openEuler users can build and load memory-scan as follows:
1. make
2. insmod memory_scan.ko
#### Contribution
......
(This diff is collapsed.)
#ifndef _EPT_IDLE_H
#define _EPT_IDLE_H
#include "ept_idle_common.h"
#define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */
#define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */
#define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */
enum ProcIdlePageType {
PTE_ACCESSED, /* 4k page */
PMD_ACCESSED, /* 2M page */
PUD_PRESENT, /* 1G page */
PTE_DIRTY,
PMD_DIRTY,
PTE_IDLE,
PMD_IDLE,
PMD_IDLE_PTES, /* all PTE idle */
PTE_HOLE,
PMD_HOLE,
PIP_CMD,
IDLE_PAGE_TYPE_MAX
};
#define PIP_TYPE(a) (0xf & ((a) >> 4))
#define PIP_SIZE(a) (0xf & (a))
#define PIP_COMPOSE(type, nr) (((type) << 4) | (nr))
#define PIP_CMD_SET_HVA PIP_COMPOSE(PIP_CMD, 0)
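/*
 * Illustrative sketch (not part of the module): a record byte produced with
 * PIP_COMPOSE() carries a ProcIdlePageType in its high nibble and a count in
 * its low nibble; PIP_TYPE()/PIP_SIZE() undo that packing. The helper name
 * below is hypothetical.
 */
static inline void pip_decode_example(uint8_t pip, int *type, int *nr)
{
	*type = PIP_TYPE(pip);	/* one of enum ProcIdlePageType */
	*nr = PIP_SIZE(pip);	/* number of entries this record covers */
}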
/*
 * EPT leaf-entry bits: bits 0-2 are the R/W/X permission bits (any of them
 * set means the mapping is present); bits 8 and 9 are the EPT accessed and
 * dirty bits.
 */
#define _PAGE_BIT_EPT_ACCESSED 8
#define _PAGE_BIT_EPT_DIRTY 9
#define _PAGE_EPT_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_EPT_ACCESSED)
#define _PAGE_EPT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_EPT_DIRTY)
#define _PAGE_EPT_PRESENT (_AT(pteval_t, 7))
static inline int ept_pte_present(pte_t a)
{
return pte_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pmd_present(pmd_t a)
{
return pmd_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pud_present(pud_t a)
{
return pud_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_p4d_present(p4d_t a)
{
return p4d_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pgd_present(pgd_t a)
{
return pgd_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pte_accessed(pte_t a)
{
return pte_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_pmd_accessed(pmd_t a)
{
return pmd_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_pud_accessed(pud_t a)
{
return pud_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_p4d_accessed(p4d_t a)
{
return p4d_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_pgd_accessed(pgd_t a)
{
return pgd_flags(a) & _PAGE_EPT_ACCESSED;
}
extern struct file_operations proc_ept_idle_operations;
#define EPT_IDLE_KBUF_FULL 1
#define EPT_IDLE_BUF_FULL 2
#define EPT_IDLE_BUF_MIN (sizeof(uint64_t) * 2 + 3)
#define EPT_IDLE_KBUF_SIZE 8000
#define IDLE_PAGE_SET_PID _IOW(0x1, 0x1, pid_t)
struct ept_idle_ctrl {
	struct mm_struct *mm;
	struct kvm *kvm;
	uint8_t kpie[EPT_IDLE_KBUF_SIZE];	/* kernel-side buffer of encoded records */
	int pie_read;
	int pie_read_max;
	void __user *buf;			/* user-space output buffer */
	int buf_size;
	int bytes_copied;			/* bytes copied to @buf so far */
	unsigned long next_hva;			/* GPA for EPT; VA for PT */
	unsigned long gpa_to_hva;
	unsigned long restart_gpa;
	unsigned long last_va;
	unsigned int flags;
};
#endif
// SPDX-License-Identifier: GPL-2.0
#ifndef _EPT_IDLE_COMMON_H
#define _EPT_IDLE_COMMON_H
/* Provide p4d_* fallbacks for old kernels without 5-level paging support */
#ifndef CONFIG_PGTABLE_LEVELS
#define EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT
#else
#if CONFIG_PGTABLE_LEVELS < 4
#define EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT
#endif // #if CONFIG_PGTABLE_LEVELS < 4
#endif // #ifndef CONFIG_PGTABLE_LEVELS
#ifdef EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT
#define p4d_t pgd_t
#define p4d_flags pgd_flags
#define p4d_offset(pgd, start) (pgd)
#define p4d_addr_end(addr, end) (end)
#define p4d_present(p4d) 1
#define p4d_ERROR(p4d) do { } while(0)
#define p4d_clear pgd_clear
#define p4d_none(p4d) 0
#define p4d_bad(p4d) 0
#define p4d_clear_bad pgd_clear_bad
#endif
#ifndef pgd_offset_pgd
#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
#endif
#endif
// SPDX-License-Identifier: GPL-2.0
// Copied from kernel mm/pagewalk.c, modified by yuan.yao@intel.com
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
#include "ept_idle_common.h"
#ifdef CONFIG_HUGETLB_PAGE
int pmd_huge(pmd_t pmd)
{
return !pmd_none(pmd) &&
(pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
}
int pud_huge(pud_t pud)
{
return !!(pud_val(pud) & _PAGE_PSE);
}
/*
* ept_idle_huge_pte_offset() - Walk the page table to resolve the hugepage
* entry at address @addr
*
* Return: Pointer to page table or swap entry (PUD or PMD) for
* address @addr, or NULL if a p*d_none() entry is encountered and the
* size @sz doesn't match the hugepage size at this level of the page
* table.
*/
pte_t *ept_idle_huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
return NULL;
p4d = p4d_offset(pgd, addr);
if (!p4d_present(*p4d))
return NULL;
pud = pud_offset(p4d, addr);
if (sz != PUD_SIZE && pud_none(*pud))
return NULL;
/* hugepage or swap? */
if (pud_huge(*pud) || !pud_present(*pud))
return (pte_t *)pud;
pmd = pmd_offset(pud, addr);
if (sz != PMD_SIZE && pmd_none(*pmd))
return NULL;
/* hugepage or swap? */
if (pmd_huge(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
return NULL;
}
#else // #ifdef CONFIG_HUGETLB_PAGE
#define pud_huge(x) 0
#define pmd_huge(x) 0
#define ept_idle_huge_pte_offset(mm, address, sz) 0
#endif
#ifndef VM_BUG_ON_VMA
#define VM_BUG_ON_VMA(cond, vma) \
do { \
if (unlikely(cond)) { \
BUG(); \
} \
} while (0)
#endif
#ifndef VM_BUG_ON_MM
#define VM_BUG_ON_MM VM_BUG_ON_VMA
#endif
static inline int ept_idle_p4d_none_or_clear_bad(p4d_t *p4d)
{
if (p4d_none(*p4d))
return 1;
if (unlikely(p4d_bad(*p4d))) {
p4d_clear_bad(p4d);
return 1;
}
return 0;
}
static inline spinlock_t *ept_idle_pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
spinlock_t *ptl;
VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
ptl = pud_lock(vma->vm_mm, pud);
if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
return ptl;
spin_unlock(ptl);
return NULL;
}
void p4d_clear_bad(p4d_t *p4d)
{
p4d_ERROR(*p4d);
p4d_clear(p4d);
}
void pmd_clear_bad(pmd_t *pmd)
{
pmd_ERROR(*pmd);
pmd_clear(pmd);
}
#ifdef _EPT_IDLE_SPLIT_PMD_
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pte_t *pte;
int err = 0;
pte = pte_offset_map(pmd, addr);
for (;;) {
err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
break;
addr += PAGE_SIZE;
if (addr == end)
break;
pte++;
}
pte_unmap(pte);
return err;
}
#endif
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pmd_t *pmd;
unsigned long next;
int err = 0;
pmd = pmd_offset(pud, addr);
do {
#ifdef _EPT_IDLE_SPLIT_PMD_
again:
#endif
next = pmd_addr_end(addr, end);
if (pmd_none(*pmd) || !walk->vma) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
/*
* This implies that each ->pmd_entry() handler
* needs to know about pmd_trans_huge() pmds
*/
if (walk->pmd_entry)
err = walk->pmd_entry(pmd, addr, next, walk);
if (err)
break;
#ifdef _EPT_IDLE_SPLIT_PMD_
/*
* Check this here so we only break down trans_huge
* pages when we _need_ to
*/
if (!walk->pte_entry)
continue;
split_huge_pmd(walk->vma, pmd, addr);
if (pmd_trans_unstable(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
#endif
} while (pmd++, addr = next, addr != end);
return err;
}
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pud_t *pud;
unsigned long next;
int err = 0;
pud = pud_offset(p4d, addr);
do {
#ifdef _EPT_IDLE_SPLIT_PUD_
again:
#endif
next = pud_addr_end(addr, end);
if (pud_none(*pud) || !walk->vma) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pud_entry) {
spinlock_t *ptl = ept_idle_pud_trans_huge_lock(pud, walk->vma);
if (ptl) {
err = walk->pud_entry(pud, addr, next, walk);
spin_unlock(ptl);
if (err)
break;
continue;
}
}
#ifdef _EPT_IDLE_SPLIT_PUD_
split_huge_pud(walk->vma, pud, addr);
if (pud_none(*pud))
goto again;
#endif
if (walk->pmd_entry || walk->pte_entry)
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);
return err;
}
static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
p4d_t *p4d;
unsigned long next;
int err = 0;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (ept_idle_p4d_none_or_clear_bad(p4d)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pmd_entry || walk->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
} while (p4d++, addr = next, addr != end);
return err;
}
static int walk_pgd_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pgd_t *pgd;
unsigned long next;
int err = 0;
pgd = pgd_offset(walk->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pmd_entry || walk->pte_entry)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
} while (pgd++, addr = next, addr != end);
return err;
}
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
unsigned long end)
{
unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
return boundary < end ? boundary : end;
}
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
struct hstate *h = hstate_vma(vma);
unsigned long next;
unsigned long hmask = huge_page_mask(h);
unsigned long sz = huge_page_size(h);
pte_t *pte;
int err = 0;
do {
next = hugetlb_entry_end(h, addr, end);
pte = ept_idle_huge_pte_offset(walk->mm, addr & hmask, sz);
if (pte)
err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
else if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
} while (addr = next, addr != end);
return err;
}
#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
return 0;
}
#endif /* CONFIG_HUGETLB_PAGE */
/*
* Decide whether we really walk over the current vma on [@start, @end)
* or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case the current walk is aborted.
*/
static int walk_page_test(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
if (walk->test_walk)
return walk->test_walk(start, end, walk);
/*
* vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
* range, so we don't walk over it as we do for normal vmas. However,
 * some callers are interested in handling hole ranges and they don't
* want to just ignore any single address range. Such users certainly
* define their ->pte_hole() callbacks, so let's delegate them to handle
* vma(VM_PFNMAP).
*/
if (vma->vm_flags & VM_PFNMAP) {
int err = 1;
if (walk->pte_hole)
err = walk->pte_hole(start, end, walk);
return err ? err : 1;
}
return 0;
}
static int __walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
int err = 0;
struct vm_area_struct *vma = walk->vma;
if (vma && is_vm_hugetlb_page(vma)) {
if (walk->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
err = walk_pgd_range(start, end, walk);
return err;
}
/**
* walk_page_range - walk page table with caller specific callbacks
* @start: start address of the virtual address range
* @end: end address of the virtual address range
* @walk: mm_walk structure defining the callbacks and the target address space
*
* Recursively walk the page table tree of the process represented by @walk->mm
* within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
* pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
* callbacks, the associated entries/pages are just ignored.
* The return values of these callbacks are commonly defined like below:
*
* - 0 : succeeded to handle the current entry, and if you don't reach the
* end address yet, continue to walk.
* - >0 : succeeded to handle the current entry, and return to the caller
* with caller specific value.
* - <0 : failed to handle the current entry, and return to the caller
* with error code.
*
* Before starting to walk page table, some callers want to check whether
* they really want to walk over the current vma, typically by checking
* its vm_flags. walk_page_test() and @walk->test_walk() are used for this
* purpose.
*
* struct mm_walk keeps current values of some common data like vma and pmd,
* which are useful for the access from callbacks. If you want to pass some
* caller-specific data to callbacks, @walk->private should be helpful.
*
* Locking:
* Callers of walk_page_range() and walk_page_vma() should hold
 * @walk->mm->mmap_sem, because these functions traverse the vma list and/or
 * access the vma's data.
*/
int ept_idle_walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
int err = 0;
unsigned long next;
struct vm_area_struct *vma;
if (start >= end)
return -EINVAL;
if (!walk->mm)
return -EINVAL;
VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
vma = find_vma(walk->mm, start);
do {
if (!vma) { /* after the last vma */
walk->vma = NULL;
next = end;
} else if (start < vma->vm_start) { /* outside vma */
walk->vma = NULL;
next = min(end, vma->vm_start);
} else { /* inside vma */
walk->vma = vma;
next = min(end, vma->vm_end);
vma = vma->vm_next;
err = walk_page_test(start, next, walk);
if (err > 0) {
/*
* positive return values are purely for
* controlling the pagewalk, so should never
* be passed to the callers.
*/
err = 0;
continue;
}
if (err < 0)
break;
}
if (walk->vma || walk->pte_hole)
err = __walk_page_range(start, next, walk);
if (err)
break;
} while (start = next, start < end);
return err;
}
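#if 0	/* Illustrative sketch only; not compiled into the module. */
/*
 * Example of how a caller might drive ept_idle_walk_page_range(): count the
 * populated PMD entries of an mm. Field names follow the pre-5.4 struct
 * mm_walk layout used throughout this file; the function names here are
 * hypothetical, and the caller must hold mm->mmap_sem for read.
 */
static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (!pmd_none(*pmd))
		(*count)++;
	return 0;	/* 0 keeps walking; <0 aborts; >0 stops with that value */
}

static unsigned long example_count_pmds(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.pmd_entry = example_pmd_entry,
		.mm = mm,
		.private = &count,
	};

	ept_idle_walk_page_range(start, end, &walk);
	return count;
}
#endif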
#ifndef _EPT_IDLE_NATIVE_PAGEWALK_H
#define _EPT_IDLE_NATIVE_PAGEWALK_H
int ept_idle_walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk);
#endif
#include "tlb_flush.h"
/* copied from 4.20 kernel:
* See Documentation/x86/tlb.txt for details. We choose 33
* because it is large enough to cover the vast majority (at
* least 95%) of allocations, and is small enough that we are
* confident it will not cause too much overhead. Each single
* flush is about 100 ns, so this caps the maximum overhead at
* _about_ 3,000 ns.
*
* This is in units of pages.
*/
static unsigned long copied_tlb_single_page_flush_ceiling __read_mostly = 33;
static bool copied_tlb_is_not_lazy(int cpu, void *data)
{
return !per_cpu(cpu_tlbstate.is_lazy, cpu);
}
/*
* flush_tlb_func_common()'s memory ordering requirement is that any
* TLB fills that happen after we flush the TLB are ordered after we
* read active_mm's tlb_gen. We don't need any explicit barriers
* because all x86 flush operations are serializing and the
* atomic64_read operation won't be reordered by the compiler.
*/
static void copied_flush_tlb_func_common(const struct flush_tlb_info *f,
bool local, enum tlb_flush_reason reason)
{
/*
* We have three different tlb_gen values in here. They are:
*
* - mm_tlb_gen: the latest generation.
* - local_tlb_gen: the generation that this CPU has already caught
* up to.
* - f->new_tlb_gen: the generation that the requester of the flush
* wants us to catch up to.
*/
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
/*
	 * init_mm is an unexported variable, but we do not need to check
	 * for it here: we only want to flush the TLB on remote CPU cores
	 * that are running a task whose address space is f->mm.
*/
#if 0
if (unlikely(loaded_mm == &init_mm))
return;
#else
if (unlikely(loaded_mm != f->mm)) {
return;
}
#endif
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
/*
* The caller of this function will set is_lazy to false explicitly
	 * so we do not need to handle that case here; just skip it.
*/
#if 0
if (this_cpu_read(cpu_tlbstate.is_lazy)) {
/*
* We're in lazy mode. We need to at least flush our
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*
* This should be rare, with native_flush_tlb_others skipping
* IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
}
#endif
if (unlikely(local_tlb_gen == mm_tlb_gen)) {
/*
* There's nothing to do: we're already up to date. This can
* happen if two concurrent flushes happen -- the first flush to
* be handled can catch us all the way up, leaving no work for
* the second flush.
*/
// trace_tlb_flush(reason, 0);
return;
}
WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
/*
* If we get to this point, we know that our TLB is out of date.
* This does not strictly imply that we need to flush (it's
* possible that f->new_tlb_gen <= local_tlb_gen), but we're
* going to need to flush in the very near future, so we might
* as well get it over with.
*
* The only question is whether to do a full or partial flush.
*
* We do a partial flush if requested and two extra conditions
* are met:
*
* 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
* we've always done all needed flushes to catch up to
* local_tlb_gen. If, for example, local_tlb_gen == 2 and
* f->new_tlb_gen == 3, then we know that the flush needed to bring
* us up to date for tlb_gen 3 is the partial flush we're
* processing.
*
* As an example of why this check is needed, suppose that there
* are two concurrent flushes. The first is a full flush that
* changes context.tlb_gen from 1 to 2. The second is a partial
* flush that changes context.tlb_gen from 2 to 3. If they get
* processed on this CPU in reverse order, we'll see
* local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
* If we were to use __flush_tlb_one_user() and set local_tlb_gen to
	 * 3, we'd break the invariant: we'd update local_tlb_gen above
* 1 without the full flush that's needed for tlb_gen 2.
*
	 * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
* Partial TLB flushes are not all that much cheaper than full TLB
* flushes, so it seems unlikely that it would be a performance win
* to do a partial flush if that won't bring our TLB fully up to
* date. By doing a full flush instead, we can increase
* local_tlb_gen all the way to mm_tlb_gen and we can probably
* avoid another flush in the very near future.
*/
if (f->end != TLB_FLUSH_ALL &&
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
unsigned long addr = f->start;
while (addr < f->end) {
__flush_tlb_one_user(addr);
addr += 1UL << f->stride_shift;
}
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
// trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
local_flush_tlb();
if (local)
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
// trace_tlb_flush(reason, TLB_FLUSH_ALL);
}
/* Both paths above update our state to mm_tlb_gen. */
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}
static void copied_flush_tlb_func_remote(void *info)
{
const struct flush_tlb_info *f = info;
bool saved_lazy;
inc_irq_stat(irq_tlb_count);
if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
return;
saved_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
this_cpu_write(cpu_tlbstate.is_lazy, false);
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
copied_flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
this_cpu_write(cpu_tlbstate.is_lazy, saved_lazy);
}
static void copied_native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
#if 0
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
else
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
(info->end - info->start) >> PAGE_SHIFT);
#endif
/*
	 * Use the non-UV path in this first version to reduce porting effort;
	 * UV systems can be supported later if necessary.
*/
#if 0
if (is_uv_system()) {
/*
* This whole special case is confused. UV has a "Broadcast
* Assist Unit", which seems to be a fancy way to send IPIs.
* Back when x86 used an explicit TLB flush IPI, UV was
* optimized to use its own mechanism. These days, x86 uses
* smp_call_function_many(), but UV still uses a manual IPI,
* and that IPI's action is out of date -- it does a manual
* flush instead of calling flush_tlb_func_remote(). This
* means that the percpu tlb_gen variables won't be updated
* and we'll do pointless flushes on future context switches.
*
* Rather than hooking native_flush_tlb_others() here, I think
* that UV should be updated so that smp_call_function_many(),
* etc, are optimal on UV.
*/
unsigned int cpu;
cpu = smp_processor_id();
cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
smp_call_function_many(cpumask, copied_flush_tlb_func_remote,
(void *)info, 1);
return;
}
#endif
/*
* If no page tables were freed, we can skip sending IPIs to
* CPUs in lazy TLB mode. They will flush the CPU themselves
* at the next context switch.
*
* However, if page tables are getting freed, we need to send the
* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
if (info->freed_tables)
smp_call_function_many(cpumask, copied_flush_tlb_func_remote,
(void *)info, 1);
else
on_each_cpu_cond_mask(copied_tlb_is_not_lazy, copied_flush_tlb_func_remote,
(void *)info, 1, GFP_ATOMIC, cpumask);
}
void copied_flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift,
bool freed_tables)
{
int cpu;
struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
.mm = mm,
.stride_shift = stride_shift,
.freed_tables = freed_tables,
};
cpu = get_cpu();
/* This is also a barrier that synchronizes with switch_mm(). */
info.new_tlb_gen = inc_mm_tlb_gen(mm);
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
((end - start) >> stride_shift) <= copied_tlb_single_page_flush_ceiling) {
info.start = start;
info.end = end;
} else {
info.start = 0UL;
info.end = TLB_FLUSH_ALL;
}
	/* This should never happen in our case. */
if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
VM_WARN_ON(irqs_disabled());
local_irq_disable();
copied_flush_tlb_func_common(&info, true, TLB_LOCAL_MM_SHOOTDOWN);
local_irq_enable();
}
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
copied_native_flush_tlb_others(mm_cpumask(mm), &info);
put_cpu();
}
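#if 0	/* Illustrative sketch only; not compiled into the module. */
/*
 * Example of how the scanner side might use copied_flush_tlb_mm_range():
 * after clearing accessed bits over [start, end) in a task's page tables,
 * flush that range from the TLBs of all CPUs running the mm. PAGE_SHIFT is
 * the stride for 4K mappings, and freed_tables is false because the scan
 * only clears bits and never frees page tables. The function name is
 * hypothetical.
 */
static void example_flush_after_clearing_young(struct mm_struct *mm,
					       unsigned long start,
					       unsigned long end)
{
	copied_flush_tlb_mm_range(mm, start, end, PAGE_SHIFT, false);
}
#endif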
#ifndef _TLB_FLUSH_H
#define _TLB_FLUSH_H
#include <asm/tlbflush.h>
void copied_flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift,
bool freed_tables);
#endif