obj-m := kvm_ept_idle.o
kvm_ept_idle-y := ept_idle.o ept_idle_native_pagewalk.o tlb_flush.o
include Kbuild
# KERNEL_SRC_DIR=/lib/modules/$(shell uname -r)/build
# MODULE_DIR=$(shell pwd)
default:
$(MAKE) -C $(KERNEL_SRC_DIR) M=$(MODULE_DIR) modules
clean:
rm -f *.o *.ko *.mod.c modules.order Module.symvers
# memory-scan
#### Description
A kernel module for scanning the page tables of processes and VMs
#### Software Architecture
memory-scan is derived from memory-optimizer (https://github.com/intel/memory-optimizer/tree/master/kernel_module). It is a kernel module that scans the page tables of processes and VMs to report which pages have been accessed.
#### Installation

1. make
2. insmod memory_scan.ko

#### Instructions

Once the module is loaded, it exposes the /proc/idle_pages interface; openEuler users drive a scan by selecting a target process with the IDLE_PAGE_SET_PID ioctl and then reading the file.
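For illustration only, a minimal user-space reader might look like the sketch below. It is not part of the module; it only relies on the /proc/idle_pages path, the IDLE_PAGE_SET_PID ioctl and the output byte format defined in ept_idle.h (high nibble = page type, low nibble = repeat count, with a PIP_CMD_SET_HVA byte followed by an 8-byte big-endian address). Error handling is pared down.

```c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/ioctl.h>

#define IDLE_PAGE_SET_PID _IOW(0x1, 0x1, pid_t)   /* same as ept_idle.h */

static const char *type_name[] = {
    "PTE_ACCESSED", "PMD_ACCESSED", "PUD_PRESENT", "PTE_DIRTY", "PMD_DIRTY",
    "PTE_IDLE", "PMD_IDLE", "PMD_IDLE_PTES", "PTE_HOLE", "PMD_HOLE", "PIP_CMD",
};

int main(int argc, char **argv)
{
    uint8_t buf[4096];
    ssize_t n;
    int fd;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <pid>\n", argv[0]);
        return 1;
    }
    fd = open("/proc/idle_pages", O_RDONLY);
    if (fd < 0 || ioctl(fd, IDLE_PAGE_SET_PID, (pid_t)atoi(argv[1])) < 0) {
        perror("idle_pages");
        return 1;
    }
    lseek(fd, 0, SEEK_SET);            /* the file offset is the start virtual address */
    n = read(fd, buf, sizeof(buf));
    for (ssize_t i = 0; i < n; i++) {
        unsigned int type = buf[i] >> 4, count = buf[i] & 0xf;
        if (type == 10 && i + 8 < n) { /* PIP_CMD_SET_HVA: 8-byte big-endian VA follows */
            unsigned long va = 0;
            for (int j = 1; j <= 8; j++)
                va = (va << 8) | buf[i + j];
            printf("cursor VA 0x%lx\n", va);
            i += 8;
            continue;
        }
        printf("%-14s x%u\n", type < 11 ? type_name[type] : "UNKNOWN", count);
    }
    close(fd);
    return 0;
}
```

Scan behaviour (huge-page-only, skim-idle, dirty-bit reporting) can additionally be requested through the SCAN_HUGE_PAGE, SCAN_SKIM_IDLE and SCAN_DIRTY_PAGE open(2) flags declared in ept_idle.h.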
#### Contribution
......
# memory-scan

#### Introduction

A kernel module for scanning the page tables of processes and VMs

#### Software Architecture

memory-scan is derived from memory-optimizer (https://github.com/intel/memory-optimizer/tree/master/kernel_module). It is a kernel module that scans the memory-page access status of processes and virtual machines.

#### Installation

1. make
2. insmod memory_scan.ko

#### Instructions

Once the module is loaded, it exposes the /proc/idle_pages interface; openEuler users drive a scan by selecting a target process with the IDLE_PAGE_SET_PID ioctl and then reading the file.

#### Contribution

......
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/proc_fs.h>
#include <linux/uaccess.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/bitmap.h>
#include <linux/sched/mm.h>
#include <linux/version.h>
#include <linux/module.h>
#include <asm/tlbflush.h>
#include <linux/fdtable.h>
#include "ept_idle.h"
#include "ept_idle_native_pagewalk.h"
#include "tlb_flush.h"
/* #define DEBUG 1 */
/*
 * Fall back to false on kernels that don't support KVM_INVALID_SPTE.
 * ept_idle can still work in this situation, but the scan accuracy may drop,
 * depending on the access frequency of the workload.
 */
#ifdef KVM_INVALID_SPTE
#define KVM_CHECK_INVALID_SPTE(val) (val) == KVM_INVALID_SPTE
#else
#define KVM_CHECK_INVALID_SPTE(val) (0)
#endif
#if LINUX_VERSION_CODE == KERNEL_VERSION(4, 17, 0)
# define pgtable_l5_enabled() (pgtable_l5_enabled)
#elif LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0)
# define pgtable_l5_enabled() (0)
#endif
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 20, 0)
# define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu)
/*For RedHat 7.7 beta*/
#elif LINUX_VERSION_CODE == KERNEL_VERSION(3, 10, 0)
# define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu)
#else
# define kvm_arch_mmu_pointer(vcpu) (&vcpu->arch.mmu)
#endif
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 20, 0)
# define kvm_mmu_ad_disabled(mmu) (mmu->mmu_role.base.ad_disabled)
/*For RedHat 7.7 beta*/
#elif LINUX_VERSION_CODE == KERNEL_VERSION(3, 10, 0)
# define kvm_mmu_ad_disabled(mmu) (mmu->mmu_role.base.ad_disabled)
#else
# define kvm_mmu_ad_disabled(mmu) (mmu->base_role.ad_disabled)
#endif
#ifdef DEBUG
#define debug_printk trace_printk
#define set_restart_gpa(val, note) ({ \
unsigned long old_val = eic->restart_gpa; \
eic->restart_gpa = (val); \
trace_printk("restart_gpa=%lx %luK %s %s %d\n", \
(val), (eic->restart_gpa - old_val) >> 10, \
note, __func__, __LINE__); \
})
#define set_next_hva(val, note) ({ \
unsigned long old_val = eic->next_hva; \
eic->next_hva = (val); \
trace_printk(" next_hva=%lx %luK %s %s %d\n", \
(val), (eic->next_hva - old_val) >> 10, \
note, __func__, __LINE__); \
})
#else
#define debug_printk(...)
#define set_restart_gpa(val, note) ({ \
eic->restart_gpa = (val); \
})
#define set_next_hva(val, note) ({ \
eic->next_hva = (val); \
})
#endif
static struct proc_dir_entry* dir_entry;
static unsigned long pagetype_size[16] = {
[PTE_ACCESSED] = PAGE_SIZE, /* 4k page */
[PMD_ACCESSED] = PMD_SIZE, /* 2M page */
[PUD_PRESENT] = PUD_SIZE, /* 1G page */
[PTE_DIRTY] = PAGE_SIZE,
[PMD_DIRTY] = PMD_SIZE,
[PTE_IDLE] = PAGE_SIZE,
[PMD_IDLE] = PMD_SIZE,
[PMD_IDLE_PTES] = PMD_SIZE,
[PTE_HOLE] = PAGE_SIZE,
[PMD_HOLE] = PMD_SIZE,
};
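/* Store @n into p[0..7] in big-endian byte order (used to encode addresses in the report stream). */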
static void u64_to_u8(uint64_t n, uint8_t *p)
{
p += sizeof(uint64_t) - 1;
*p-- = n; n >>= 8;
*p-- = n; n >>= 8;
*p-- = n; n >>= 8;
*p-- = n; n >>= 8;
*p-- = n; n >>= 8;
*p-- = n; n >>= 8;
*p-- = n; n >>= 8;
*p = n;
}
static void dump_eic(struct ept_idle_ctrl *eic)
{
debug_printk("ept_idle_ctrl: pie_read=%d pie_read_max=%d buf_size=%d "
"bytes_copied=%d next_hva=%lx restart_gpa=%lx "
"gpa_to_hva=%lx\n",
eic->pie_read,
eic->pie_read_max,
eic->buf_size,
eic->bytes_copied,
eic->next_hva,
eic->restart_gpa,
eic->gpa_to_hva);
}
static void eic_report_addr(struct ept_idle_ctrl *eic, unsigned long addr)
{
unsigned long hva;
eic->kpie[eic->pie_read++] = PIP_CMD_SET_HVA;
hva = addr;
u64_to_u8(hva, &eic->kpie[eic->pie_read]);
eic->pie_read += sizeof(uint64_t);
debug_printk("eic_report_addr %lx\n", addr);
dump_eic(eic);
}
static int eic_add_page(struct ept_idle_ctrl *eic,
unsigned long addr,
unsigned long next,
enum ProcIdlePageType page_type)
{
int page_size = pagetype_size[page_type];
debug_printk("eic_add_page addr=%lx next=%lx "
"page_type=%d pagesize=%dK\n",
addr, next, (int)page_type, (int)page_size >> 10);
dump_eic(eic);
/* align the kernel/user view of the cursor position */
next = round_up(next, page_size);
if (!eic->pie_read ||
addr + eic->gpa_to_hva != eic->next_hva) {
/* merge hole */
if (page_type == PTE_HOLE ||
page_type == PMD_HOLE) {
set_restart_gpa(next, "PTE_HOLE|PMD_HOLE");
return 0;
}
if (addr + eic->gpa_to_hva < eic->next_hva) {
debug_printk("ept_idle: addr moves backwards\n");
WARN_ONCE(1, "ept_idle: addr moves backwards");
}
if (eic->pie_read + sizeof(uint64_t) + 2 >= eic->pie_read_max) {
set_restart_gpa(addr, "EPT_IDLE_KBUF_FULL");
return EPT_IDLE_KBUF_FULL;
}
eic_report_addr(eic, round_down(addr, page_size) +
eic->gpa_to_hva);
} else {
if (PIP_TYPE(eic->kpie[eic->pie_read - 1]) == page_type &&
PIP_SIZE(eic->kpie[eic->pie_read - 1]) < 0xF) {
set_next_hva(next + eic->gpa_to_hva, "IN-PLACE INC");
set_restart_gpa(next, "IN-PLACE INC");
eic->kpie[eic->pie_read - 1]++;
WARN_ONCE(page_size < next-addr, "next-addr too large");
return 0;
}
if (eic->pie_read >= eic->pie_read_max) {
set_restart_gpa(addr, "EPT_IDLE_KBUF_FULL");
return EPT_IDLE_KBUF_FULL;
}
}
set_next_hva(next + eic->gpa_to_hva, "NEW-ITEM");
set_restart_gpa(next, "NEW-ITEM");
eic->kpie[eic->pie_read] = PIP_COMPOSE(page_type, 1);
eic->pie_read++;
return 0;
}
// Borrowed from zhou, jianshi <jianshi.zhou@intel.com> and modified by yy, thanks to jianshi.
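// Look up the task for pid @nr, take a reference on its mm_struct and, if the
// task holds an open "anon_inode:kvm-vm" fd, a reference on its struct kvm.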
static int get_mm_and_kvm_by_pid(pid_t nr,
struct mm_struct** mmp,
struct kvm** kvmp)
{
struct task_struct* task;
struct files_struct* files;
struct kvm* kvm = NULL;
struct mm_struct* mm = NULL;
struct pid* pid;
int fd, max_fds;
rcu_read_lock();
if(!(pid = find_vpid(nr))) {
rcu_read_unlock();
printk(KERN_ERR"failed to get vpid for pid = %d\n", nr);
return -ESRCH;
}
if(!(task = pid_task(pid, PIDTYPE_PID))){
rcu_read_unlock();
printk(KERN_ERR"failed to get task_struct for pid = %d\n", nr);
return -ESRCH;
}
// kthread has no mm_struct*
mm = get_task_mm(task);
if (!mm) {
rcu_read_unlock();
printk(KERN_ERR"faild to get mm_struct for pid = %d\n", nr);
return -ESRCH;
}
files = task->files;
max_fds = files_fdtable(files)->max_fds;
for(fd = 0; fd < max_fds; fd++) {
struct file* file;
char buffer[32];
char* fname;
if(!(file = fcheck_files(files, fd)))
continue;
fname = d_path(&(file->f_path), buffer, sizeof(buffer));
if(fname < buffer || fname >= buffer + sizeof(buffer))
continue;
if(strcmp(fname, "anon_inode:kvm-vm") == 0) {
kvm = file->private_data;
if (kvm)
kvm_get_kvm(kvm);
break;
}
}
rcu_read_unlock();
*kvmp = kvm;
*mmp = mm;
return 0;
}
static int ept_pte_range(struct ept_idle_ctrl *eic,
pmd_t *pmd, unsigned long addr, unsigned long end)
{
pte_t *pte;
enum ProcIdlePageType page_type;
int err = 0;
pte = pte_offset_kernel(pmd, addr);
do {
if (KVM_CHECK_INVALID_SPTE(pte->pte)) {
page_type = PTE_IDLE;
} else if (!ept_pte_present(*pte))
page_type = PTE_HOLE;
else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED,
(unsigned long *) &pte->pte))
page_type = PTE_IDLE;
else {
page_type = PTE_ACCESSED;
if (eic->flags & SCAN_DIRTY_PAGE) {
if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY,
(unsigned long *) &pte->pte))
page_type = PTE_DIRTY;
}
}
err = eic_add_page(eic, addr, addr + PAGE_SIZE, page_type);
if (err)
break;
} while (pte++, addr += PAGE_SIZE, addr != end);
return err;
}
static int ept_pmd_range(struct ept_idle_ctrl *eic,
pud_t *pud, unsigned long addr, unsigned long end)
{
pmd_t *pmd;
unsigned long next;
enum ProcIdlePageType page_type;
enum ProcIdlePageType pte_page_type;
int err = 0;
if (eic->flags & SCAN_HUGE_PAGE)
pte_page_type = PMD_IDLE_PTES;
else
pte_page_type = IDLE_PAGE_TYPE_MAX;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (KVM_CHECK_INVALID_SPTE(pmd->pmd)) {
page_type = PMD_IDLE;
} else if (!ept_pmd_present(*pmd))
page_type = PMD_HOLE; /* likely won't hit here */
else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED,
(unsigned long *)pmd)) {
if (pmd_large(*pmd))
page_type = PMD_IDLE;
else if (eic->flags & SCAN_SKIM_IDLE)
page_type = PMD_IDLE_PTES;
else
page_type = pte_page_type;
} else if (pmd_large(*pmd)) {
page_type = PMD_ACCESSED;
if (eic->flags & SCAN_DIRTY_PAGE) {
if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY,
(unsigned long *) pmd))
page_type = PMD_DIRTY;
}
} else
page_type = pte_page_type;
if (page_type != IDLE_PAGE_TYPE_MAX)
err = eic_add_page(eic, addr, next, page_type);
else
err = ept_pte_range(eic, pmd, addr, next);
if (err)
break;
} while (pmd++, addr = next, addr != end);
return err;
}
static int ept_pud_range(struct ept_idle_ctrl *eic,
p4d_t *p4d, unsigned long addr, unsigned long end)
{
pud_t *pud;
unsigned long next;
int err = 0;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
if (!ept_pud_present(*pud)) {
set_restart_gpa(next, "PUD_HOLE");
continue;
}
if (pud_large(*pud))
err = eic_add_page(eic, addr, next, PUD_PRESENT);
else
err = ept_pmd_range(eic, pud, addr, next);
if (err)
break;
} while (pud++, addr = next, addr != end);
return err;
}
static int ept_p4d_range(struct ept_idle_ctrl *eic,
pgd_t *pgd, unsigned long addr, unsigned long end)
{
p4d_t *p4d;
unsigned long next;
int err = 0;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (!ept_p4d_present(*p4d)) {
set_restart_gpa(next, "P4D_HOLE");
continue;
}
err = ept_pud_range(eic, p4d, addr, next);
if (err)
break;
} while (p4d++, addr = next, addr != end);
return err;
}
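/*
 * Walk the guest's EPT over the GPA range [addr, end), starting from vCPU 0's
 * MMU root; accessed/dirty bits are cleared at the PTE/PMD level as pages are
 * reported.
 */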
static int ept_page_range(struct ept_idle_ctrl *eic,
unsigned long addr,
unsigned long end)
{
struct kvm_vcpu *vcpu;
struct kvm_mmu *mmu;
pgd_t *ept_root;
pgd_t *pgd;
unsigned long next;
int err = 0;
BUG_ON(addr >= end);
spin_lock(&eic->kvm->mmu_lock);
vcpu = kvm_get_vcpu(eic->kvm, 0);
if (!vcpu) {
spin_unlock(&eic->kvm->mmu_lock);
return -EINVAL;
}
mmu = kvm_arch_mmu_pointer(vcpu);
if (!VALID_PAGE(mmu->root_hpa)) {
spin_unlock(&eic->kvm->mmu_lock);
return -EINVAL;
}
ept_root = __va(mmu->root_hpa);
spin_unlock(&eic->kvm->mmu_lock);
local_irq_disable();
pgd = pgd_offset_pgd(ept_root, addr);
do {
next = pgd_addr_end(addr, end);
if (!ept_pgd_present(*pgd)) {
set_restart_gpa(next, "PGD_HOLE");
continue;
}
err = ept_p4d_range(eic, pgd, addr, next);
if (err)
break;
} while (pgd++, addr = next, addr != end);
local_irq_enable();
return err;
}
static int init_ept_idle_ctrl_buffer(struct ept_idle_ctrl *eic)
{
eic->pie_read = 0;
eic->pie_read_max = min(EPT_IDLE_KBUF_SIZE,
eic->buf_size - eic->bytes_copied);
/* reserve space for a trailing PIP_CMD_SET_HVA record */
eic->pie_read_max -= sizeof(uint64_t) + 1;
/*
* Align with the EPT_IDLE_KBUF_FULL logic in eic_add_page(),
* to avoid eic->pie_read == 0 when EPT_IDLE_KBUF_FULL happens.
*/
if (eic->pie_read_max <= sizeof(uint64_t) + 2)
return EPT_IDLE_KBUF_FULL;
memset(eic->kpie, 0, sizeof(eic->kpie));
return 0;
}
static void setup_ept_idle_ctrl(struct ept_idle_ctrl *eic, void* buf,
int buf_size, unsigned int flags)
{
eic->buf = buf;
eic->buf_size = buf_size;
eic->bytes_copied = 0;
eic->next_hva = 0;
eic->gpa_to_hva = 0;
eic->restart_gpa = 0;
eic->last_va = 0;
eic->flags = flags;
}
static int ept_idle_copy_user(struct ept_idle_ctrl *eic,
unsigned long start, unsigned long end)
{
int bytes_read;
int lc = 0; /* last copy? */
int ret;
debug_printk("ept_idle_copy_user %lx %lx\n", start, end);
dump_eic(eic);
/* Break out of loop on no more progress. */
if (!eic->pie_read) {
lc = 1;
if (start < end)
start = end;
}
if (start >= end && start > eic->next_hva) {
set_next_hva(start, "TAIL-HOLE");
eic_report_addr(eic, start);
}
bytes_read = eic->pie_read;
if (!bytes_read)
return 1;
ret = copy_to_user(eic->buf, eic->kpie, bytes_read);
if (ret)
return -EFAULT;
eic->buf += bytes_read;
eic->bytes_copied += bytes_read;
if (eic->bytes_copied >= eic->buf_size)
return EPT_IDLE_BUF_FULL;
if (lc)
return lc;
ret = init_ept_idle_ctrl_buffer(eic);
if (ret)
return ret;
cond_resched();
return 0;
}
/*
* Depending on whether hva falls in a memslot:
*
* 1) found => return gpa and remaining memslot size in *addr_range
*
* |<----- addr_range --------->|
* [ mem slot ]
* ^hva
*
* 2) not found => return hole size in *addr_range
*
* |<----- addr_range --------->|
* [ first mem slot above hva ]
* ^hva
*
* If hva is above all mem slots, *addr_range will be ~0UL. We can finish read(2).
*/
static unsigned long ept_idle_find_gpa(struct ept_idle_ctrl *eic,
unsigned long hva,
unsigned long *addr_range)
{
struct kvm *kvm = eic->kvm;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
unsigned long hva_end;
gfn_t gfn;
*addr_range = ~0UL;
mutex_lock(&kvm->slots_lock);
slots = kvm_memslots(eic->kvm);
kvm_for_each_memslot(memslot, slots) {
hva_end = memslot->userspace_addr +
(memslot->npages << PAGE_SHIFT);
if (hva >= memslot->userspace_addr && hva < hva_end) {
gpa_t gpa;
gfn = hva_to_gfn_memslot(hva, memslot);
*addr_range = hva_end - hva;
gpa = gfn_to_gpa(gfn);
debug_printk("ept_idle_find_gpa slot %lx=>%llx %lx=>%llx "
"delta %llx size %lx\n",
memslot->userspace_addr,
gfn_to_gpa(memslot->base_gfn),
hva, gpa,
hva - gpa,
memslot->npages << PAGE_SHIFT);
mutex_unlock(&kvm->slots_lock);
return gpa;
}
if (memslot->userspace_addr > hva)
*addr_range = min(*addr_range,
memslot->userspace_addr - hva);
}
mutex_unlock(&kvm->slots_lock);
return INVALID_PAGE;
}
static int ept_idle_supports_cpu(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
struct kvm_mmu *mmu;
int ret;
vcpu = kvm_get_vcpu(kvm, 0);
if (!vcpu)
return -EINVAL;
spin_lock(&kvm->mmu_lock);
mmu = kvm_arch_mmu_pointer(vcpu);
if (kvm_mmu_ad_disabled(mmu)) {
printk(KERN_NOTICE
"CPU does not support EPT A/D bits tracking\n");
ret = -EINVAL;
} else if (mmu->shadow_root_level != 4 + (!!pgtable_l5_enabled())) {
printk(KERN_NOTICE "Unsupported EPT level %d\n",
mmu->shadow_root_level);
ret = -EINVAL;
} else
ret = 0;
spin_unlock(&kvm->mmu_lock);
return ret;
}
static int ept_idle_walk_hva_range(struct ept_idle_ctrl *eic,
unsigned long start, unsigned long end)
{
unsigned long gpa_addr;
unsigned long addr_range;
unsigned long va_end;
int ret;
ret = ept_idle_supports_cpu(eic->kvm);
if (ret)
return ret;
ret = init_ept_idle_ctrl_buffer(eic);
if (ret)
return ret;
for (; start < end;) {
gpa_addr = ept_idle_find_gpa(eic, start, &addr_range);
if (gpa_addr == INVALID_PAGE) {
eic->gpa_to_hva = 0;
if (addr_range == ~0UL) /* beyond max virtual address */ {
set_restart_gpa(TASK_SIZE, "EOF");
va_end = end;
} else {
start += addr_range;
set_restart_gpa(start, "OUT-OF-SLOT");
va_end = start;
}
} else {
eic->gpa_to_hva = start - gpa_addr;
ept_page_range(eic, gpa_addr, gpa_addr + addr_range);
va_end = eic->gpa_to_hva + gpa_addr + addr_range;
}
start = eic->restart_gpa + eic->gpa_to_hva;
ret = ept_idle_copy_user(eic, start, va_end);
if (ret)
break;
}
if (eic->bytes_copied)
ret = 0;
return ret;
}
static ssize_t mm_idle_read(struct file *file, char *buf,
size_t count, loff_t *ppos);
static ssize_t ept_idle_read(struct file *file, char *buf,
size_t count, loff_t *ppos)
{
struct ept_idle_ctrl *eic = file->private_data;
unsigned long hva_start = *ppos;
unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT));
int ret;
if (!eic) {
printk(KERN_ERR"NULL eic instance\n");
return -ENOMEM;
}
if (hva_start >= TASK_SIZE) {
debug_printk("ept_idle_read past TASK_SIZE: %lx %lx\n",
hva_start, TASK_SIZE);
return 0;
}
if (!eic->mm)
return -EINVAL;
if (!eic->kvm)
return mm_idle_read(file, buf, count, ppos);
if (hva_end <= hva_start) {
debug_printk("ept_idle_read past EOF: %lx %lx\n",
hva_start, hva_end);
return 0;
}
if (*ppos & (PAGE_SIZE - 1)) {
debug_printk("ept_idle_read unaligned ppos: %lx\n",
hva_start);
return -EINVAL;
}
if (count < EPT_IDLE_BUF_MIN) {
debug_printk("ept_idle_read small count: %lx\n",
(unsigned long)count);
return -EINVAL;
}
setup_ept_idle_ctrl(eic, buf, count, file->f_flags);
ret = ept_idle_walk_hva_range(eic, hva_start, hva_end);
if (ret)
goto out_kvm;
ret = eic->bytes_copied;
*ppos = eic->next_hva;
debug_printk("ppos=%lx bytes_copied=%d\n",
eic->next_hva, ret);
out_kvm:
return ret;
}
static int ept_idle_open(struct inode *inode, struct file *file)
{
struct ept_idle_ctrl* eic;
if (!try_module_get(THIS_MODULE)) {
file->private_data = NULL;
return -EBUSY;
}
eic = kzalloc(sizeof(*eic), GFP_KERNEL);
file->private_data = eic;
if (!eic) {
printk(KERN_ERR"Failed to alloc ept_idle_ctrl \n");
return -ENOMEM;
}
return 0;
}
static int ept_idle_release(struct inode *inode, struct file *file)
{
struct kvm *kvm;
struct ept_idle_ctrl* eic = file->private_data;
int ret = 0;
if (!eic)
goto out;
if (eic->kvm) {
kvm = eic->kvm;
spin_lock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
kvm_put_kvm(kvm);
} else if (eic->mm) {
copied_flush_tlb_mm_range(eic->mm, 0UL, TLB_FLUSH_ALL, 0UL, true);
}
if (eic->mm)
mmput(eic->mm);
kfree(eic);
out:
module_put(THIS_MODULE);
return ret;
}
static int mm_idle_pte_range(struct ept_idle_ctrl *eic, pmd_t *pmd,
unsigned long addr, unsigned long next)
{
enum ProcIdlePageType page_type;
pte_t *pte;
int err = 0;
pte = pte_offset_kernel(pmd, addr);
do {
if (!pte_present(*pte))
page_type = PTE_HOLE;
else if (!test_and_clear_bit(_PAGE_BIT_ACCESSED,
(unsigned long *) &pte->pte))
page_type = PTE_IDLE;
else {
page_type = PTE_ACCESSED;
}
err = eic_add_page(eic, addr, addr + PAGE_SIZE, page_type);
if (err)
break;
} while (pte++, addr += PAGE_SIZE, addr != next);
return err;
}
static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ept_idle_ctrl *eic = walk->private;
enum ProcIdlePageType page_type;
enum ProcIdlePageType pte_page_type;
int err;
/*
* Skip duplicate PMD_IDLE_PTES: when a PMD crosses a VMA boundary,
* walk_page_range() can invoke this callback on the same PMD twice.
*/
if ((addr & PMD_MASK) == (eic->last_va & PMD_MASK)) {
debug_printk("ignore duplicate addr %lx %lx\n",
addr, eic->last_va);
return 0;
}
eic->last_va = addr;
if (eic->flags & SCAN_HUGE_PAGE)
pte_page_type = PMD_IDLE_PTES;
else
pte_page_type = IDLE_PAGE_TYPE_MAX;
#if 0
if (!pmd_present(*pmd))
page_type = PMD_HOLE;
else if (!test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmd)) {
if (pmd_large(*pmd))
page_type = PMD_IDLE;
else if (eic->flags & SCAN_SKIM_IDLE)
page_type = PMD_IDLE_PTES;
else
page_type = pte_page_type;
} else if (pmd_large(*pmd)) {
page_type = PMD_ACCESSED;
} else
page_type = pte_page_type;
#else
// don't clear the A bit in the PMD for 4K pages, as that conflicts with pmd_bad()
if (!pmd_present(*pmd))
page_type = PMD_HOLE;
else if (!pmd_large(*pmd))
page_type = pte_page_type;
else if (!test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmd))
page_type = PMD_IDLE;
else
page_type = PMD_ACCESSED;
#endif
if (page_type != IDLE_PAGE_TYPE_MAX)
err = eic_add_page(eic, addr, next, page_type);
else
err = mm_idle_pte_range(eic, pmd, addr, next);
return err;
}
static int mm_idle_pud_entry(pud_t *pud, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ept_idle_ctrl *eic = walk->private;
if ((addr & PUD_MASK) != (eic->last_va & PUD_MASK)) {
eic_add_page(eic, addr, next, PUD_PRESENT);
eic->last_va = addr;
}
return 1;
}
static int mm_idle_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
if (vma->vm_file) {
if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE)
return 0;
return 1;
}
return 0;
}
static int mm_idle_walk_range(struct ept_idle_ctrl *eic,
unsigned long start,
unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma;
int ret;
ret = init_ept_idle_ctrl_buffer(eic);
if (ret)
return ret;
for (; start < end;)
{
down_read(&walk->mm->mmap_sem);
vma = find_vma(walk->mm, start);
if (vma) {
if (end > vma->vm_start) {
local_irq_disable();
ret = ept_idle_walk_page_range(start, end, walk);
local_irq_enable();
} else
set_restart_gpa(vma->vm_start, "VMA-HOLE");
} else
set_restart_gpa(TASK_SIZE, "EOF");
up_read(&walk->mm->mmap_sem);
WARN_ONCE(eic->gpa_to_hva, "non-zero gpa_to_hva");
start = eic->restart_gpa;
ret = ept_idle_copy_user(eic, start, end);
if (ret)
break;
}
if (eic->bytes_copied) {
if (ret != EPT_IDLE_BUF_FULL && eic->next_hva < end)
debug_printk("partial scan: next_hva=%lx end=%lx\n",
eic->next_hva, end);
ret = 0;
} else
WARN_ONCE(1, "nothing read");
return ret;
}
static ssize_t mm_idle_read(struct file *file, char *buf,
size_t count, loff_t *ppos)
{
struct ept_idle_ctrl *eic = file->private_data;
struct mm_walk mm_walk = {};
unsigned long va_start = *ppos;
unsigned long va_end = va_start + (count << (3 + PAGE_SHIFT));
int ret;
if (va_end <= va_start) {
debug_printk("mm_idle_read past EOF: %lx %lx\n",
va_start, va_end);
return 0;
}
if (*ppos & (PAGE_SIZE - 1)) {
debug_printk("mm_idle_read unaligned ppos: %lx\n",
va_start);
return -EINVAL;
}
if (count < EPT_IDLE_BUF_MIN) {
debug_printk("mm_idle_read small count: %lx\n",
(unsigned long)count);
return -EINVAL;
}
setup_ept_idle_ctrl(eic, buf, count, file->f_flags);
mm_walk.mm = eic->mm;
mm_walk.pmd_entry = mm_idle_pmd_entry;
mm_walk.pud_entry = mm_idle_pud_entry;
mm_walk.test_walk = mm_idle_test_walk;
mm_walk.private = eic;
ret = mm_idle_walk_range(eic, va_start, va_end, &mm_walk);
if (ret)
goto out_mm;
ret = eic->bytes_copied;
*ppos = eic->next_hva;
debug_printk("ppos=%lx bytes_copied=%d\n",
eic->next_hva, ret);
out_mm:
return ret;
}
// copied from fs/proc/base.c mem_lseek
static loff_t ept_idle_lseek(struct file *file, loff_t offset, int orig)
{
switch (orig) {
case 0:
file->f_pos = offset;
break;
case 1:
file->f_pos += offset;
break;
default:
return -EINVAL;
}
force_successful_syscall_return();
return file->f_pos;
}
static long ept_idle_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg)
{
struct ept_idle_ctrl* eic;
pid_t target_pid = (pid_t)arg;
long ret;
eic = filp->private_data;
if (!eic) {
printk(KERN_ERR"NULL eic instance \n");
return -ENOMEM;
}
switch(ioctl) {
case IDLE_PAGE_SET_PID:
ret = get_mm_and_kvm_by_pid(target_pid, &eic->mm, &eic->kvm);
break;
default:
ret = -EINVAL;
break;
}
return ret;
}
struct file_operations proc_idle_page_oprations = {
.llseek = ept_idle_lseek,
.read = ept_idle_read,
.open = ept_idle_open,
.release = ept_idle_release,
.unlocked_ioctl = ept_idle_ioctl
};
static int ept_idle_entry(void)
{
dir_entry = proc_create("idle_pages", S_IWUSR | S_IRUGO, NULL,
&proc_idle_page_oprations);
if (!dir_entry) {
printk("Failed to create idle_pages in /porc\n");
return -ENOMEM;
}
return 0;
}
static void ept_idle_exit(void)
{
if (dir_entry)
proc_remove(dir_entry);
}
MODULE_LICENSE("GPL");
module_init(ept_idle_entry);
module_exit(ept_idle_exit);
#ifndef _EPT_IDLE_H
#define _EPT_IDLE_H
#include "ept_idle_common.h"
#define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */
#define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */
#define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */
enum ProcIdlePageType {
PTE_ACCESSED, /* 4k page */
PMD_ACCESSED, /* 2M page */
PUD_PRESENT, /* 1G page */
PTE_DIRTY,
PMD_DIRTY,
PTE_IDLE,
PMD_IDLE,
PMD_IDLE_PTES, /* all PTE idle */
PTE_HOLE,
PMD_HOLE,
PIP_CMD,
IDLE_PAGE_TYPE_MAX
};
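/*
 * Report stream byte format: the high nibble carries a ProcIdlePageType and
 * the low nibble a repeat count (1..15) of consecutive pages of that type.
 * A PIP_CMD_SET_HVA byte is followed by an 8-byte big-endian address that
 * repositions the cursor (see eic_report_addr()).
 */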
#define PIP_TYPE(a) (0xf & ((a) >> 4))
#define PIP_SIZE(a) (0xf & (a))
#define PIP_COMPOSE(type, nr) (((type) << 4) | (nr))
#define PIP_CMD_SET_HVA PIP_COMPOSE(PIP_CMD, 0)
#define _PAGE_BIT_EPT_ACCESSED 8
#define _PAGE_BIT_EPT_DIRTY 9
#define _PAGE_EPT_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_EPT_ACCESSED)
#define _PAGE_EPT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_EPT_DIRTY)
#define _PAGE_EPT_PRESENT (_AT(pteval_t, 7))
static inline int ept_pte_present(pte_t a)
{
return pte_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pmd_present(pmd_t a)
{
return pmd_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pud_present(pud_t a)
{
return pud_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_p4d_present(p4d_t a)
{
return p4d_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pgd_present(pgd_t a)
{
return pgd_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pte_accessed(pte_t a)
{
return pte_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_pmd_accessed(pmd_t a)
{
return pmd_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_pud_accessed(pud_t a)
{
return pud_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_p4d_accessed(p4d_t a)
{
return p4d_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_pgd_accessed(pgd_t a)
{
return pgd_flags(a) & _PAGE_EPT_ACCESSED;
}
extern struct file_operations proc_ept_idle_operations;
#define EPT_IDLE_KBUF_FULL 1
#define EPT_IDLE_BUF_FULL 2
#define EPT_IDLE_BUF_MIN (sizeof(uint64_t) * 2 + 3)
#define EPT_IDLE_KBUF_SIZE 8000
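/*
 * Bind the open file to a target process: the ioctl resolves the task's
 * mm_struct and, if the task holds an "anon_inode:kvm-vm" fd, its struct kvm
 * (see get_mm_and_kvm_by_pid()).
 */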
#define IDLE_PAGE_SET_PID _IOW(0x1, 0x1, pid_t)
struct ept_idle_ctrl {
struct mm_struct *mm;
struct kvm *kvm;
uint8_t kpie[EPT_IDLE_KBUF_SIZE];
int pie_read;
int pie_read_max;
void __user *buf;
int buf_size;
int bytes_copied;
unsigned long next_hva; /* GPA for EPT; VA for PT */
unsigned long gpa_to_hva;
unsigned long restart_gpa;
unsigned long last_va;
unsigned int flags;
};
#endif
// SPDX-License-Identifier: GPL-2.0
#ifndef _EPT_IDLE_COMMON_H
#define _EPT_IDLE_COMMON_H
/* Provide p4d_* fallbacks on old kernels that lack 5-level paging support */
#ifndef CONFIG_PGTABLE_LEVELS
#define EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT
#else
#if CONFIG_PGTABLE_LEVELS < 4
#define EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT
#endif // #if CONFIG_PGTABLE_LEVELS < 4
#endif // #ifndef CONFIG_PGTABLE_LEVELS
#ifdef EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT
#define p4d_t pgd_t
#define p4d_flags pgd_flags
#define p4d_offset(pgd, start) (pgd)
#define p4d_addr_end(addr, end) (end)
#define p4d_present(p4d) 1
#define p4d_ERROR(p4d) do { } while(0)
#define p4d_clear pgd_clear
#define p4d_none(p4d) 0
#define p4d_bad(p4d) 0
#define p4d_clear_bad pgd_clear_bad
#endif
#ifndef pgd_offset_pgd
#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
#endif
#endif
// SPDX-License-Identifier: GPL-2.0
// Copied from kernel mm/pagewalk.c, modified by yuan.yao@intel.com
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
#include "ept_idle_common.h"
#ifdef CONFIG_HUGETLB_PAGE
int pmd_huge(pmd_t pmd)
{
return !pmd_none(pmd) &&
(pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
}
int pud_huge(pud_t pud)
{
return !!(pud_val(pud) & _PAGE_PSE);
}
/*
* ept_idle_huge_pte_offset() - Walk the page table to resolve the hugepage
* entry at address @addr
*
* Return: Pointer to page table or swap entry (PUD or PMD) for
* address @addr, or NULL if a p*d_none() entry is encountered and the
* size @sz doesn't match the hugepage size at this level of the page
* table.
*/
pte_t *ept_idle_huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
return NULL;
p4d = p4d_offset(pgd, addr);
if (!p4d_present(*p4d))
return NULL;
pud = pud_offset(p4d, addr);
if (sz != PUD_SIZE && pud_none(*pud))
return NULL;
/* hugepage or swap? */
if (pud_huge(*pud) || !pud_present(*pud))
return (pte_t *)pud;
pmd = pmd_offset(pud, addr);
if (sz != PMD_SIZE && pmd_none(*pmd))
return NULL;
/* hugepage or swap? */
if (pmd_huge(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
return NULL;
}
#else // #ifdef CONFIG_HUGETLB_PAGE
#define pud_huge(x) 0
#define pmd_huge(x) 0
#define ept_idle_huge_pte_offset(mm, address, sz) 0
#endif
#ifndef VM_BUG_ON_VMA
#define VM_BUG_ON_VMA(cond, vma) \
do { \
if (unlikely(cond)) { \
BUG(); \
} \
} while (0)
#endif
#ifndef VM_BUG_ON_MM
#define VM_BUG_ON_MM VM_BUG_ON_VMA
#endif
static inline int ept_idle_p4d_none_or_clear_bad(p4d_t *p4d)
{
if (p4d_none(*p4d))
return 1;
if (unlikely(p4d_bad(*p4d))) {
p4d_clear_bad(p4d);
return 1;
}
return 0;
}
static inline spinlock_t *ept_idle_pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
spinlock_t *ptl;
VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
ptl = pud_lock(vma->vm_mm, pud);
if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
return ptl;
spin_unlock(ptl);
return NULL;
}
void p4d_clear_bad(p4d_t *p4d)
{
p4d_ERROR(*p4d);
p4d_clear(p4d);
}
void pmd_clear_bad(pmd_t *pmd)
{
pmd_ERROR(*pmd);
pmd_clear(pmd);
}
#ifdef _EPT_IDLE_SPLIT_PMD_
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pte_t *pte;
int err = 0;
pte = pte_offset_map(pmd, addr);
for (;;) {
err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
break;
addr += PAGE_SIZE;
if (addr == end)
break;
pte++;
}
pte_unmap(pte);
return err;
}
#endif
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pmd_t *pmd;
unsigned long next;
int err = 0;
pmd = pmd_offset(pud, addr);
do {
#ifdef _EPT_IDLE_SPLIT_PMD_
again:
#endif
next = pmd_addr_end(addr, end);
if (pmd_none(*pmd) || !walk->vma) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
/*
* This implies that each ->pmd_entry() handler
* needs to know about pmd_trans_huge() pmds
*/
if (walk->pmd_entry)
err = walk->pmd_entry(pmd, addr, next, walk);
if (err)
break;
#ifdef _EPT_IDLE_SPLIT_PMD_
/*
* Check this here so we only break down trans_huge
* pages when we _need_ to
*/
if (!walk->pte_entry)
continue;
split_huge_pmd(walk->vma, pmd, addr);
if (pmd_trans_unstable(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
#endif
} while (pmd++, addr = next, addr != end);
return err;
}
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pud_t *pud;
unsigned long next;
int err = 0;
pud = pud_offset(p4d, addr);
do {
#ifdef _EPT_IDLE_SPLIT_PUD_
again:
#endif
next = pud_addr_end(addr, end);
if (pud_none(*pud) || !walk->vma) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pud_entry) {
spinlock_t *ptl = ept_idle_pud_trans_huge_lock(pud, walk->vma);
if (ptl) {
err = walk->pud_entry(pud, addr, next, walk);
spin_unlock(ptl);
if (err)
break;
continue;
}
}
#ifdef _EPT_IDLE_SPLIT_PUD_
split_huge_pud(walk->vma, pud, addr);
if (pud_none(*pud))
goto again;
#endif
if (walk->pmd_entry || walk->pte_entry)
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);
return err;
}
static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
p4d_t *p4d;
unsigned long next;
int err = 0;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (ept_idle_p4d_none_or_clear_bad(p4d)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pmd_entry || walk->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
} while (p4d++, addr = next, addr != end);
return err;
}
static int walk_pgd_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pgd_t *pgd;
unsigned long next;
int err = 0;
pgd = pgd_offset(walk->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pmd_entry || walk->pte_entry)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
} while (pgd++, addr = next, addr != end);
return err;
}
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
unsigned long end)
{
unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
return boundary < end ? boundary : end;
}
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
struct hstate *h = hstate_vma(vma);
unsigned long next;
unsigned long hmask = huge_page_mask(h);
unsigned long sz = huge_page_size(h);
pte_t *pte;
int err = 0;
do {
next = hugetlb_entry_end(h, addr, end);
pte = ept_idle_huge_pte_offset(walk->mm, addr & hmask, sz);
if (pte)
err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
else if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
} while (addr = next, addr != end);
return err;
}
#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
return 0;
}
#endif /* CONFIG_HUGETLB_PAGE */
/*
* Decide whether we really walk over the current vma on [@start, @end)
* or skip it via the returned value. Return 0 if we do walk over the
* current vma, and return 1 if we skip the vma. A negative value means
* an error, where we abort the current walk.
*/
static int walk_page_test(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
if (walk->test_walk)
return walk->test_walk(start, end, walk);
/*
* vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
* range, so we don't walk over it as we do for normal vmas. However,
* some callers are interested in handling hole ranges and they don't
* want to just ignore any single address range. Such users certainly
* define their ->pte_hole() callbacks, so let's delegate them to handle
* vma(VM_PFNMAP).
*/
if (vma->vm_flags & VM_PFNMAP) {
int err = 1;
if (walk->pte_hole)
err = walk->pte_hole(start, end, walk);
return err ? err : 1;
}
return 0;
}
static int __walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
int err = 0;
struct vm_area_struct *vma = walk->vma;
if (vma && is_vm_hugetlb_page(vma)) {
if (walk->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
err = walk_pgd_range(start, end, walk);
return err;
}
/**
* walk_page_range - walk page table with caller specific callbacks
* @start: start address of the virtual address range
* @end: end address of the virtual address range
* @walk: mm_walk structure defining the callbacks and the target address space
*
* Recursively walk the page table tree of the process represented by @walk->mm
* within the virtual address range [@start, @end). During walking, we can do
* some caller-specific works for each entry, by setting up pmd_entry(),
* pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
* callbacks, the associated entries/pages are just ignored.
* The return values of these callbacks are commonly defined like below:
*
* - 0 : succeeded to handle the current entry, and if you don't reach the
* end address yet, continue to walk.
* - >0 : succeeded to handle the current entry, and return to the caller
* with caller specific value.
* - <0 : failed to handle the current entry, and return to the caller
* with error code.
*
* Before starting to walk page table, some callers want to check whether
* they really want to walk over the current vma, typically by checking
* its vm_flags. walk_page_test() and @walk->test_walk() are used for this
* purpose.
*
* struct mm_walk keeps current values of some common data like vma and pmd,
* which are useful for the access from callbacks. If you want to pass some
* caller-specific data to callbacks, @walk->private should be helpful.
*
* Locking:
* Callers of walk_page_range() and walk_page_vma() should hold
* @walk->mm->mmap_sem, because these function traverse vma list and/or
* access to vma's data.
*/
int ept_idle_walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
int err = 0;
unsigned long next;
struct vm_area_struct *vma;
if (start >= end)
return -EINVAL;
if (!walk->mm)
return -EINVAL;
VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
vma = find_vma(walk->mm, start);
do {
if (!vma) { /* after the last vma */
walk->vma = NULL;
next = end;
} else if (start < vma->vm_start) { /* outside vma */
walk->vma = NULL;
next = min(end, vma->vm_start);
} else { /* inside vma */
walk->vma = vma;
next = min(end, vma->vm_end);
vma = vma->vm_next;
err = walk_page_test(start, next, walk);
if (err > 0) {
/*
* positive return values are purely for
* controlling the pagewalk, so should never
* be passed to the callers.
*/
err = 0;
continue;
}
if (err < 0)
break;
}
if (walk->vma || walk->pte_hole)
err = __walk_page_range(start, next, walk);
if (err)
break;
} while (start = next, start < end);
return err;
}
#ifndef _EPT_IDLE_NATIVE_PAGEWALK_H
#define _EPT_IDLE_NATIVE_PAGEWALK_H
int ept_idle_walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk);
#endif
#include "tlb_flush.h"
/* copied from 4.20 kernel:
* See Documentation/x86/tlb.txt for details. We choose 33
* because it is large enough to cover the vast majority (at
* least 95%) of allocations, and is small enough that we are
* confident it will not cause too much overhead. Each single
* flush is about 100 ns, so this caps the maximum overhead at
* _about_ 3,000 ns.
*
* This is in units of pages.
*/
static unsigned long copied_tlb_single_page_flush_ceiling __read_mostly = 33;
static bool copied_tlb_is_not_lazy(int cpu, void *data)
{
return !per_cpu(cpu_tlbstate.is_lazy, cpu);
}
/*
* flush_tlb_func_common()'s memory ordering requirement is that any
* TLB fills that happen after we flush the TLB are ordered after we
* read active_mm's tlb_gen. We don't need any explicit barriers
* because all x86 flush operations are serializing and the
* atomic64_read operation won't be reordered by the compiler.
*/
static void copied_flush_tlb_func_common(const struct flush_tlb_info *f,
bool local, enum tlb_flush_reason reason)
{
/*
* We have three different tlb_gen values in here. They are:
*
* - mm_tlb_gen: the latest generation.
* - local_tlb_gen: the generation that this CPU has already caught
* up to.
* - f->new_tlb_gen: the generation that the requester of the flush
* wants us to catch up to.
*/
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
/*
* init_mm is an unexported variable, but we don't need to
* check for it here in our case: we just want to flush
* the TLB on remote CPU cores that are running tasks
* using f->mm as their memory space.
*/
#if 0
if (unlikely(loaded_mm == &init_mm))
return;
#else
if (unlikely(loaded_mm != f->mm)) {
return;
}
#endif
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
/*
* The caller of this function will set is_lazy to false explicitly
* so we don't need to handle this case; just skip it.
*/
#if 0
if (this_cpu_read(cpu_tlbstate.is_lazy)) {
/*
* We're in lazy mode. We need to at least flush our
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*
* This should be rare, with native_flush_tlb_others skipping
* IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
}
#endif
if (unlikely(local_tlb_gen == mm_tlb_gen)) {
/*
* There's nothing to do: we're already up to date. This can
* happen if two concurrent flushes happen -- the first flush to
* be handled can catch us all the way up, leaving no work for
* the second flush.
*/
// trace_tlb_flush(reason, 0);
return;
}
WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
/*
* If we get to this point, we know that our TLB is out of date.
* This does not strictly imply that we need to flush (it's
* possible that f->new_tlb_gen <= local_tlb_gen), but we're
* going to need to flush in the very near future, so we might
* as well get it over with.
*
* The only question is whether to do a full or partial flush.
*
* We do a partial flush if requested and two extra conditions
* are met:
*
* 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
* we've always done all needed flushes to catch up to
* local_tlb_gen. If, for example, local_tlb_gen == 2 and
* f->new_tlb_gen == 3, then we know that the flush needed to bring
* us up to date for tlb_gen 3 is the partial flush we're
* processing.
*
* As an example of why this check is needed, suppose that there
* are two concurrent flushes. The first is a full flush that
* changes context.tlb_gen from 1 to 2. The second is a partial
* flush that changes context.tlb_gen from 2 to 3. If they get
* processed on this CPU in reverse order, we'll see
* local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
* If we were to use __flush_tlb_one_user() and set local_tlb_gen to
* 3, we'd break the invariant: we'd update local_tlb_gen above
* 1 without the full flush that's needed for tlb_gen 2.
*
* 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
* Partial TLB flushes are not all that much cheaper than full TLB
* flushes, so it seems unlikely that it would be a performance win
* to do a partial flush if that won't bring our TLB fully up to
* date. By doing a full flush instead, we can increase
* local_tlb_gen all the way to mm_tlb_gen and we can probably
* avoid another flush in the very near future.
*/
if (f->end != TLB_FLUSH_ALL &&
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
unsigned long addr = f->start;
while (addr < f->end) {
__flush_tlb_one_user(addr);
addr += 1UL << f->stride_shift;
}
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
// trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
local_flush_tlb();
if (local)
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
// trace_tlb_flush(reason, TLB_FLUSH_ALL);
}
/* Both paths above update our state to mm_tlb_gen. */
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}
static void copied_flush_tlb_func_remote(void *info)
{
const struct flush_tlb_info *f = info;
bool saved_lazy;
inc_irq_stat(irq_tlb_count);
if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
return;
saved_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
this_cpu_write(cpu_tlbstate.is_lazy, false);
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
copied_flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
this_cpu_write(cpu_tlbstate.is_lazy, saved_lazy);
}
static void copied_native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
#if 0
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
else
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
(info->end - info->start) >> PAGE_SHIFT);
#endif
/*
* Use the non-UV code path in this first version to reduce porting effort;
* we will support UV systems later if necessary.
*/
#if 0
if (is_uv_system()) {
/*
* This whole special case is confused. UV has a "Broadcast
* Assist Unit", which seems to be a fancy way to send IPIs.
* Back when x86 used an explicit TLB flush IPI, UV was
* optimized to use its own mechanism. These days, x86 uses
* smp_call_function_many(), but UV still uses a manual IPI,
* and that IPI's action is out of date -- it does a manual
* flush instead of calling flush_tlb_func_remote(). This
* means that the percpu tlb_gen variables won't be updated
* and we'll do pointless flushes on future context switches.
*
* Rather than hooking native_flush_tlb_others() here, I think
* that UV should be updated so that smp_call_function_many(),
* etc, are optimal on UV.
*/
unsigned int cpu;
cpu = smp_processor_id();
cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
smp_call_function_many(cpumask, copied_flush_tlb_func_remote,
(void *)info, 1);
return;
}
#endif
/*
* If no page tables were freed, we can skip sending IPIs to
* CPUs in lazy TLB mode. They will flush the CPU themselves
* at the next context switch.
*
* However, if page tables are getting freed, we need to send the
* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
if (info->freed_tables)
smp_call_function_many(cpumask, copied_flush_tlb_func_remote,
(void *)info, 1);
else
on_each_cpu_cond_mask(copied_tlb_is_not_lazy, copied_flush_tlb_func_remote,
(void *)info, 1, GFP_ATOMIC, cpumask);
}
void copied_flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift,
bool freed_tables)
{
int cpu;
struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
.mm = mm,
.stride_shift = stride_shift,
.freed_tables = freed_tables,
};
cpu = get_cpu();
/* This is also a barrier that synchronizes with switch_mm(). */
info.new_tlb_gen = inc_mm_tlb_gen(mm);
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
((end - start) >> stride_shift) <= copied_tlb_single_page_flush_ceiling) {
info.start = start;
info.end = end;
} else {
info.start = 0UL;
info.end = TLB_FLUSH_ALL;
}
/* This should never happen in our case */
if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
VM_WARN_ON(irqs_disabled());
local_irq_disable();
copied_flush_tlb_func_common(&info, true, TLB_LOCAL_MM_SHOOTDOWN);
local_irq_enable();
}
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
copied_native_flush_tlb_others(mm_cpumask(mm), &info);
put_cpu();
}
#ifndef _TLB_FLUSH_H
#define _TLB_FLUSH_H
#include <asm/tlbflush.h>
void copied_flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift,
bool freed_tables);
#endif